Compare commits

...

1211 Commits

Author SHA1 Message Date
Timothy 252710fb41 fix: context health and eviction 2026-04-15 11:40:45 -07:00
Richard Tang 22df99ef51 Merge remote-tracking branch 'origin/main'
2026-04-14 19:56:33 -07:00
Richard Tang edc3135797 Merge branch 'feature/new-colony' 2026-04-14 19:56:08 -07:00
Richard Tang 27b15789fb fix: skills prompts 2026-04-14 18:51:14 -07:00
RichardTang-Aden 5ba5933edc Merge pull request #7046 from vincentjiang777/main
docs: new readme
2026-04-14 18:02:49 -07:00
Timothy 50eb4b0e8f Merge branch 'feature/colony-creation' into feature/new-colony 2026-04-14 16:34:30 -07:00
Richard Tang 3e4a4c9924 Merge remote-tracking branch 'origin/feat/text-only-tool-filter' into feature/new-colony 2026-04-14 16:29:19 -07:00
Richard Tang c47987e73c fix: ask user widget fallback 2026-04-14 16:27:12 -07:00
Timothy 256b52b818 fix: skills for colonies 2026-04-14 16:23:17 -07:00
Richard Tang 8f5daf0569 fix: switching model and new chat 2026-04-14 16:04:07 -07:00
bryan af5c72e785 feat: hide image-producing tools and vision-only prompt blocks from text-only models 2026-04-14 12:50:44 -07:00
Timothy 958bafea29 fix: tool gated skill activation 2026-04-14 11:17:03 -07:00
bryan 5cdc01cb8c fix: preserve tool pill mapping across turn boundary for deferred ask_user completions 2026-04-14 10:56:38 -07:00
Timothy 6979ea825d fix: remove tool limit 2026-04-14 10:35:08 -07:00
Timothy d6093a560f Merge branch 'feature/new-colony' into feature/colony-creation 2026-04-14 10:19:24 -07:00
Hundao 2f58cce781 fix(tools): web_scrape truncation no longer exceeds max_length (#7044)
The previous code did `text[:max_length] + "..."`, which made the
returned content always 3 chars longer than the requested max_length.
Reserve room for the ellipsis inside the limit so the contract holds.

Fixes #2098
2026-04-14 14:24:42 +08:00
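
The fix above amounts to reserving room for the ellipsis inside the limit. A minimal sketch of that contract, with an illustrative helper name rather than the actual web_scrape internals:

```python
def truncate(text: str, max_length: int) -> str:
    """Return at most max_length characters, ellipsis included."""
    if len(text) <= max_length:
        return text
    if max_length <= 3:
        # Nothing meaningful fits alongside an ellipsis; hard-cut instead.
        return text[:max_length]
    # Reserve 3 characters for the ellipsis so the result never exceeds max_length.
    return text[: max_length - 3] + "..."
```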
Richard Tang ab76a66646 fix: queen loading 2026-04-13 22:39:39 -07:00
Richard Tang c575ff3fe7 feat: queen messages improvements 2026-04-13 22:31:49 -07:00
Timothy 8668d103a8 Merge branch 'feature/new-colony' into feature/colony-creation 2026-04-13 21:34:17 -07:00
Timothy 133f393f8b feat: scheduled triggers 2026-04-13 21:33:54 -07:00
Timothy fd3ef36a15 fix: side panel 2026-04-13 21:08:11 -07:00
Timothy aa281aad34 fix: remove deprecated graphs 2026-04-13 20:56:47 -07:00
Richard Tang a3d0c7e0cb fix: remove No ask_user prompt in the examples 2026-04-13 20:54:17 -07:00
Richard Tang de3042ba3f fix: prompts on the home page are not passed to the queen directly; users have to wait until the hello message finishes 2026-04-13 20:34:11 -07:00
Timothy 326d7f201c Merge branch 'feature/new-colony' into feature/colony-creation 2026-04-13 19:59:34 -07:00
Timothy db30ef3094 fix: reframe colony creation 2026-04-13 19:56:14 -07:00
Timothy e3d1cb6739 fix: colony creation link 2026-04-13 19:46:24 -07:00
Timothy 846f3f2470 feat: improve tool call reliability 2026-04-13 19:34:47 -07:00
Richard Tang 913437ea0b fix: build error 2026-04-13 18:06:40 -07:00
Richard Tang 520bd635e2 Merge branch 'feature/hive-experimental-comp-pipeline' into feature/new-colony 2026-04-13 18:02:34 -07:00
bryan b7d850ddd0 feat: add LLM key validation endpoint, emit agent errors via SSE, and improve key management UI 2026-04-13 16:25:43 -07:00
Timothy 0a251278f1 feat: learned default skills 2026-04-13 10:34:25 -07:00
Timothy 857af8e6a3 fix: gcu system prompt 2026-04-13 10:00:00 -07:00
Timothy 273d4ec66e fix: upgrade browser skills 2026-04-13 09:45:07 -07:00
Timothy eeb46a2b3e fix: tool credential filter 2026-04-11 12:54:26 -07:00
Timothy b5e05fefae fix: screenshot 2026-04-11 09:53:53 -07:00
Timothy bdfbb7698a fix: browser click 2026-04-10 23:34:39 -07:00
Timothy 35b1eadb7f fix: improve reliability 2026-04-10 22:46:30 -07:00
Timothy 38036eb7bd fix: reliability tunes 2026-04-10 22:12:13 -07:00
Timothy 70d90fda19 fix: screenshot 2026-04-10 21:11:49 -07:00
vincentjiang777 9dc214cfd2 Merge branch 'aden-hive:main' into main 2026-04-10 20:35:42 -07:00
Bryan 1e3dcbbbc2 feat: ask user tool in queen prompt 2026-04-10 17:46:18 -07:00
Bryan 53b095cdcb feat: use ask_user and ask_user_multiple 2026-04-10 17:31:32 -07:00
Timothy d04862053f fix: queen instruction on colony creation 2026-04-10 17:31:01 -07:00
Timothy df0e0ea082 Merge branch 'fix/after-colony-refresh' into feature/new-colony 2026-04-10 17:19:22 -07:00
Timothy b1724ee360 fix: after colony creation list needs refresh 2026-04-10 17:18:59 -07:00
Bryan a59493835d fix: new session for prompt library and new chat 2026-04-10 17:17:55 -07:00
Timothy 334af2b74e fix: default log level 2026-04-10 16:58:27 -07:00
Richard Tang 81c72949ce feat: prompt library ui improvement 2026-04-10 16:54:34 -07:00
Timothy 97fd45d36a fix: mcp tool initialization 2026-04-10 16:52:04 -07:00
Timothy caebbea1aa fix: initialize default mcps 2026-04-10 16:42:03 -07:00
Richard Tang 574a3a284e Merge remote-tracking branch 'origin/feature/new-colony' into feature/new-colony 2026-04-10 16:38:50 -07:00
Richard Tang 8ea3fb8cfe chore: align the hive tool names 2026-04-10 16:38:21 -07:00
Timothy 69d16a8f6c fix: remove deprecated tools 2026-04-10 16:26:29 -07:00
Richard Tang f16cb0ea1f fix: frontend dm fix 2026-04-10 16:25:33 -07:00
Richard Tang e0f1e9d494 feat: efficient mcp loading in initialization 2026-04-10 16:23:36 -07:00
Richard Tang 7fb0da26fc feat: register available MCP tools 2026-04-10 16:01:42 -07:00
Timothy f5f72c1c9c Merge branch 'feature/hive-experimental-comp-pipeline' into feature/new-colony 2026-04-10 15:56:41 -07:00
Timothy 06d0a16201 Merge branch 'feature/colony-orchestrate' into feature/new-colony 2026-04-10 15:52:16 -07:00
Timothy 0964758b12 Merge branch 'feature/colony-orchestrate' into feature/hive-experimental-comp-pipeline 2026-04-10 15:48:02 -07:00
Bryan c25abdfd84 feat: natural chat replies + cleaner home-prompt bootstrap 2026-04-10 15:47:28 -07:00
Timothy af720bb569 fix: stop worker 2026-04-10 15:40:35 -07:00
Bryan b763226a64 docs: update references for orchestrator/host/loader renames 2026-04-10 15:39:36 -07:00
Timothy 9b7580d22b fix: colony event bus subscription 2026-04-10 15:33:44 -07:00
Timothy c23c274ac7 feat: colony creation with skill 2026-04-10 15:09:27 -07:00
Timothy 1335a15341 Merge branch 'feature/new-colony' into feature/colony-orchestrate 2026-04-10 12:47:38 -07:00
Timothy 2a1cbaa582 fix: worker spawn 2026-04-10 12:47:14 -07:00
Richard Tang 74cba57cce Merge remote-tracking branch 'origin/feature/new-colony-credentials' into feature/new-colony 2026-04-10 12:15:11 -07:00
Richard Tang 7616de2417 feat: escalation and queen reply tools 2026-04-10 12:14:49 -07:00
Richard Tang d96875932a fix: correct aden support tag 2026-04-10 12:03:39 -07:00
Richard Tang 238d90871a feat: stable credential states 2026-04-10 11:33:34 -07:00
Timothy e38e1563ba fix: worker execution 2026-04-10 10:26:29 -07:00
Timothy e3d8b89b69 fix: tool blacklist 2026-04-10 09:07:17 -07:00
Timothy ec64c14d37 fix: test cases 2026-04-09 23:51:51 -07:00
Timothy fb5b7ed9de fix: integration tests 2026-04-09 23:05:11 -07:00
Timothy da0aa65c31 refactor: big test cleanup 2026-04-09 22:04:23 -07:00
Timothy cbf7cc0a37 feat(agent): simple fork 2026-04-09 20:42:28 -07:00
Richard Tang 802f64f4a7 feat: cooldown for reflection 2026-04-09 19:00:10 -07:00
Richard Tang 9ad95fde59 chore: ruff lint 2026-04-09 18:22:16 -07:00
Richard Tang b812f6a03a feat: user memory structure and identity 2026-04-09 18:09:38 -07:00
Richard Tang 0299a87d0c fix: queen identity for new session 2026-04-09 18:07:42 -07:00
Timothy 4aa2358211 feat: doppelganger wiring 2026-04-09 18:04:45 -07:00
Richard Tang bc8a97079e feat: queen role and examples 2026-04-09 17:55:22 -07:00
Richard Tang 6eaa609f63 feat: queen scope memory 2026-04-09 17:33:14 -07:00
Bryan 8f0101b273 fix(queen): handle extra text in selector JSON response 2026-04-09 17:13:20 -07:00
Bryan 5ee98ac7cf feat: add prompt library with search and category filtering 2026-04-09 17:00:09 -07:00
Bryan c058029ac0 feat: add aden credentials storage adapter 2026-04-09 16:59:16 -07:00
Bryan 6a79728d99 feat: update model switcher and enhance queen DM page with navigation 2026-04-09 16:58:55 -07:00
Bryan 200c202465 refactor: update provider descriptions and simplify subscription activation 2026-04-09 16:58:36 -07:00
Bryan 791da46f59 feat: add subscription-based LLM config activation endpoint 2026-04-09 16:58:21 -07:00
Bryan 6377c5b094 refactor: cache tool registry and add queen identity selection hook 2026-04-09 16:58:09 -07:00
Bryan 8f4e901c3c feat: add kimi and hive providers to model catalog 2026-04-09 16:57:53 -07:00
Timothy 4be61ebfc7 refactor: shatter the eld*n ring 2026-04-09 16:57:43 -07:00
Richard Tang ac46ce7bfb fix: unavailable minimax model and enhance reflection log 2026-04-09 16:37:09 -07:00
Richard Tang 110d7e0075 fix: remove outdated queen communication prompt 2026-04-09 15:36:56 -07:00
Richard Tang 749185e760 feat: queen dm prompt 2026-04-09 15:26:35 -07:00
Richard Tang 5cb75d1822 chore: instruction on resetting the port 2026-04-09 15:01:22 -07:00
Richard Tang 3febef106d fix: queen identity loading 2026-04-09 14:47:42 -07:00
Richard Tang db18186825 Merge remote-tracking branch 'origin/feature/hive-experimental-comp-pipeline' into feature/hive-experimental-comp-pipeline 2026-04-09 13:59:25 -07:00
Richard Tang 87918b5263 feat: queen selection like a CEO 2026-04-09 13:58:38 -07:00
Bryan @ Aden 01f258c4c4 Merge pull request #7006 from vincentjiang777/main
micro-fix: readme & 500 use cases
2026-04-09 13:46:36 -07:00
Vincent Jiang 3d992bbda3 readme & 500 use cases 2026-04-09 13:43:35 -07:00
Timothy df43f36385 fix: issues 2026-04-09 12:59:42 -07:00
Richard Tang bdd099bb78 feat: queen selection prompt 2026-04-09 12:58:59 -07:00
Richard Tang acca008772 feat: update provider config 2026-04-09 11:59:41 -07:00
Richard Tang 0bf4d8b9fa fix: session resume 2026-04-09 11:44:03 -07:00
Richard Tang 7a2752eb42 feat: consolidate model config 2026-04-09 09:53:05 -07:00
Timothy c65b43c21b Merge branch 'feature/browser-use-fix' into feature/hive-experimental-comp-pipeline 2026-04-09 08:53:37 -07:00
Timothy 90f376136e fix: always on tools 2026-04-09 07:21:24 -07:00
Richard Tang d5ea28f8f3 chore: loading message 2026-04-08 19:11:46 -07:00
Richard Tang 1ccfc7aefa feat: update the model config and selection 2026-04-08 19:09:30 -07:00
Timothy 64830a6720 fix: config validation 2026-04-08 19:03:26 -07:00
Timothy 514d2828fa fix: tool issues 2026-04-08 18:52:34 -07:00
Richard Tang 5705647364 feat: new session for the queen 2026-04-08 18:42:10 -07:00
Richard Tang 8a3e1e68a9 feat: route the new user request into a queen session and add switch for queen sessions 2026-04-08 18:31:46 -07:00
Richard Tang 4c900e9ab2 fix: position of queen tool bubble 2026-04-08 18:21:13 -07:00
Richard Tang fa0518b249 fix: show tool calls in queen dm message 2026-04-08 17:58:15 -07:00
Richard Tang 6a5bc0d484 fix: edge case causing message injection in session resume 2026-04-08 17:48:59 -07:00
Bryan d288c865d0 feat: sync user profile to global memory as user-profile.md; add queen profile API transformation 2026-04-08 17:42:57 -07:00
bryan 81051a11fc Merge branch 'feature/hive-experimental-comp-pipeline' into feat/open-hive-colony 2026-04-08 16:53:39 -07:00
Richard Tang c4a8c73b24 Merge remote-tracking branch 'origin/feature/hive-experimental-comp-pipeline' into feature/hive-experimental-comp-pipeline 2026-04-08 16:49:17 -07:00
Richard Tang 2b8ed0eb05 fix: bug causing queen message injection when resuming a session 2026-04-08 16:48:46 -07:00
Timothy 40c530603b fix: internal tag choice of diction 2026-04-08 16:38:55 -07:00
Timothy dee3980dbe fix: browser, csv tools 2026-04-08 16:32:26 -07:00
Richard Tang d19cb2843e feat: separate resume and new session flow for queen sessions 2026-04-08 15:45:37 -07:00
Richard Tang ea31b037b8 feat: register browser tools and skills for queen 2026-04-08 15:29:00 -07:00
Richard Tang 5fe924318d Merge remote-tracking branch 'origin/feature/hive-experimental-comp-pipeline' into feat/independent-queen 2026-04-08 15:09:04 -07:00
Bryan 8e6a812ce6 Merge branch 'feature/hive-experimental-comp-pipeline' into feat/open-hive-colony 2026-04-08 15:08:00 -07:00
Bryan 1565fd52e1 feat: add user profile settings and UI enhancements 2026-04-08 15:07:01 -07:00
Bryan 53f5f93deb fix: correct import paths for subscription token detection in BYOK modal 2026-04-08 15:06:05 -07:00
Richard Tang 21afac2b59 feat: use independent mode when pm queen 2026-04-08 14:54:07 -07:00
Timothy c03f1caa58 fix: strip internal tags 2026-04-08 14:52:41 -07:00
Richard Tang a5e928ac95 feat: independent phase queen 2026-04-08 14:36:24 -07:00
Timothy 648e3cd52a feat: think out loud 2026-04-08 14:30:24 -07:00
Timothy b216df76a0 fix: character inception 2026-04-08 14:07:58 -07:00
Bryan ddee82eaef Merge branch 'feature/hive-experimental-comp-pipeline' into feat/open-hive-colony 2026-04-08 12:56:50 -07:00
Timothy 6e88bb0205 feat: wire queen dms 2026-04-08 12:56:00 -07:00
Bryan 0aa19721c3 Merge branch 'feature/hive-experimental-comp-pipeline' into feat/open-hive-colony 2026-04-08 12:11:48 -07:00
Timothy cf1e26b012 Merge branch 'feat/open-hive-colony' into feature/hive-experimental-comp-pipeline 2026-04-08 12:08:42 -07:00
Timothy 47e02c0821 Merge branch 'feat/queen-profile' into feature/hive-experimental-comp-pipeline 2026-04-08 12:07:21 -07:00
Bryan 7e1ebf1c26 Merge branch 'feature/hive-experimental-comp-pipeline' into feat/open-hive-colony 2026-04-08 11:50:39 -07:00
Bryan ecbf543e4c Merge branch 'main' into feat/open-hive-colony 2026-04-08 11:50:07 -07:00
Timothy 7daca39bb2 fix: proper skill loading 2026-04-08 11:37:29 -07:00
Aagrim Rautela d8712ceb72 fix(core): add error handling to _dump_failed_request to prevent crashes on read only filesystem (#6036) 2026-04-08 16:51:53 +08:00
Navya Bijoy 5a90a4ba42 update default storage path from /tmp to ~/.hive/agents/{agent_name}/ (#6556) 2026-04-08 16:22:45 +08:00
Emmanuel Nwanguma e69c381331 test(event_bus): add comprehensive unit tests for EventBus (#4826)
* test(event_bus): add comprehensive unit tests for EventBus

- Add 38 tests covering all EventBus functionality
- Test subscription management (subscribe/unsubscribe)
- Test event publishing and delivery to subscribers
- Test filtering by stream, node, and execution
- Test concurrency (handler errors, semaphore limits)
- Test history operations (get_history, get_stats)
- Test wait_for async waiting with timeout
- Test convenience publishers (emit_* methods)
- Test AgentEvent dataclass and EventType enum

Fixes #4782

* test(event_bus): add comprehensive unit tests for EventBus

- Add 41 tests covering all EventBus functionality
- Test subscription management (subscribe/unsubscribe)
- Test event publishing and delivery to subscribers
- Test filtering by stream, node, execution, and graph_id
- Test concurrency (handler errors, semaphore limits)
- Test history operations (get_history, get_stats)
- Test wait_for async waiting with timeout
- Test convenience publishers including new emit_tool_doom_loop
  and emit_escalation_requested methods
- Test AgentEvent dataclass with graph_id field
- Test EventType enum including NODE_TOOL_DOOM_LOOP and ESCALATION_REQUESTED

Fixes #4782

* test(event_bus): add tests for new worker monitoring events

Add tests for newly added emit methods:
- emit_worker_escalation_ticket (judge → queen escalation)
- emit_queen_intervention_requested (queen → operator escalation)
- emit_llm_turn_complete (LLM turn metadata)
- emit_node_action_plan (node planning)

Update test_key_event_types_exist to include:
- WORKER_ESCALATION_TICKET, QUEEN_INTERVENTION_REQUESTED
- LLM_TURN_COMPLETE, NODE_ACTION_PLAN
- WORKER_LOADED, CREDENTIALS_REQUIRED

Total: 45 tests (up from 41)

* test(event_bus): add tests for run_id, subagent_report, and new event types

- Add tests for run_id field in AgentEvent.to_dict()
- Add test for emit_subagent_report convenience method
- Update test_key_event_types_exist with new event types
- Total: 48 tests

* fix(test): remove tests for deleted EventBus methods and fix enum names

Remove test_emit_worker_escalation_ticket and
test_emit_queen_intervention_requested (methods were removed in recent
refactors). Fix WORKER_LOADED -> WORKER_GRAPH_LOADED in enum assertions.

* style: add future annotations import and fix empty assertion in test_emit_node_action_plan
2026-04-08 16:06:53 +08:00
Gaurav Rai 8f608048f9 feat(tools): add Weights & Biases ML experiment tracking integration (#6963)
* feat(tools): add Weights & Biases experiment tracking and model monitoring integration

* style: fix ruff formatting in wandb_tool.py

* feat(tools): add Weights & Biases ML experiment tracking integration

* fix(tools): address CodeRabbit review comments on wandb_tool

* fix(tools): rewrite wandb_tool to use official Python SDK instead of undocumented REST endpoints

* fix(tools): address Hundao review — remove .coverage, switch to GraphQL/httpx, fix wandb_host, add README

* fix(tools): wire filters to GraphQL, validate empty metric_keys, fix line lengths

* fix(tools): check credentials before input validation in wandb_get_run_metrics

Move _get_creds() call before run_id/metric_keys checks so the
framework credential test receives the expected {error, help} response
instead of a bare input-validation error.
2026-04-08 14:57:03 +08:00
Hundao df29c49bd0 fix(test): update queen memory reflection test mocks for litellm format (#6991)
* fix(test): update queen memory test mocks to match litellm ModelResponse format

PR #6976 refactored reflection_agent to extract tool calls from litellm
ModelResponse objects (choices[0].message.tool_calls) instead of plain
dicts. The test mocks were not updated, causing tool calls to silently
fail and two tests to break.

Fixes #6990

* style: ruff format
2026-04-08 14:44:11 +08:00
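
For context, a minimal sketch of what a litellm-shaped mock looks like: tool calls are read from `choices[0].message.tool_calls` rather than from a plain dict, as the commit describes. The object shapes and tool name below are illustrative, not the project's actual fixtures.

```python
from types import SimpleNamespace

# Illustrative mock shaped like a litellm ModelResponse (not the real fixture).
tool_call = SimpleNamespace(
    id="call_1",
    type="function",
    function=SimpleNamespace(name="write_memory", arguments='{"note": "remember this"}'),
)
mock_response = SimpleNamespace(
    choices=[SimpleNamespace(message=SimpleNamespace(content="", tool_calls=[tool_call]))]
)

# Extraction path described in the commit: choices[0].message.tool_calls.
for call in mock_response.choices[0].message.tool_calls or []:
    print(call.function.name, call.function.arguments)
```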
Timothy b3759db83b refactor(hive): home hive dir structure 2026-04-07 19:21:16 -07:00
Bryan 8308207be8 feat: add light mode support for flowchart, sub-agent panes, and normalize settings modal sizing 2026-04-07 19:11:54 -07:00
Timothy 6b86c602c7 Merge branch 'main' into feature/hive-experimental-comp-pipeline 2026-04-07 18:49:14 -07:00
Bryan d9644eaa39 chore: remove old workspace GUI and dependencies 2026-04-07 18:45:56 -07:00
Bryan 3976ea6934 feat: add home redesign, credentials, and org chart pages 2026-04-07 18:45:38 -07:00
Bryan cc00ae8999 feat: add colony chat and queen DM pages 2026-04-07 18:45:20 -07:00
Bryan 70bf337c03 refactor: update graph and subagent display components 2026-04-07 18:44:55 -07:00
Bryan 6ecdbf47b0 feat: add model switcher, settings modal, and template card 2026-04-07 18:44:07 -07:00
Bryan e0e1abbb64 feat: add sidebar and header components 2026-04-07 18:43:48 -07:00
Bryan cb8c26ee18 feat: add app layout, routing, and global styles 2026-04-07 18:43:25 -07:00
Bryan 3d6beca577 feat: add config and credential API client endpoints 2026-04-07 18:43:05 -07:00
Bryan bed9670395 feat: add colony types, registry, and context providers 2026-04-07 18:42:46 -07:00
Bryan 61bb0b6594 refactor: update session, credential routes and runner 2026-04-07 18:42:25 -07:00
Bryan e7506fcd25 feat: add runtime config and model switching API 2026-04-07 18:42:08 -07:00
Timothy 7cc92eb8c3 fix: queen session start prompt 2026-04-07 18:00:15 -07:00
Timothy 3a70243b82 refactor(queen): use new infra 2026-04-07 17:50:45 -07:00
Richard Tang b701605a62 feat: update queen profile apis 2026-04-07 17:10:42 -07:00
Timothy a5b17a293b refactor: simplify agent loading 2026-04-07 17:03:12 -07:00
Richard Tang 7ad25d986b feat: queen profile 2026-04-07 16:49:34 -07:00
Timothy db572b9be6 fix: mcp registry pipeline stage 2026-04-07 16:15:40 -07:00
Timothy 0ee653a164 fix: agent loading pipeline 2026-04-07 15:20:31 -07:00
RichardTang-Aden c92662bdb1 Merge pull request #6976 from aden-hive/feat/simplify-queen-memory
Simplify queen memory: remove colony memory, keep global only
2026-04-07 13:58:26 -07:00
Richard Tang 19469ff404 chore: lint format 2026-04-07 13:57:05 -07:00
Richard Tang 7fcb51985d fix: edge case for memory recall 2026-04-07 13:55:18 -07:00
Timothy 3c9911c25b refactor: grand clean-up 2026-04-07 13:42:39 -07:00
Richard Tang 3dbd20040a fix: reflection agent runner 2026-04-07 13:07:41 -07:00
Richard Tang c9d62139af feat: add extra logging 2026-04-07 12:45:37 -07:00
Timothy 172b180477 refactor: remove deprecated shims 2026-04-07 12:28:19 -07:00
Richard Tang 6637bc8d96 feat: simplify memory implementation 2026-04-07 12:08:35 -07:00
Timothy 93dc35dcbb refactor(architecture): revamp 2026-04-07 09:19:03 -07:00
Richard Tang 30ad3edfbf docs: readme improvement 2026-04-07 09:18:34 -07:00
Timothy d10912be15 feat: key pool 2026-04-06 16:53:55 -07:00
Timothy b9c5191059 chore: update gitignore 2026-04-06 16:49:13 -07:00
Bryan @ Aden d9037172d8 Merge pull request #6898 from sundaram2021/fix/ast_pow_ddos_mitigation
micro-fix(security): mitigate ast.Pow DoS and enforce safe_eval timeout
2026-04-06 13:36:03 -07:00
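
A minimal sketch of the two mitigations named in the PR title: bound the operands of ast.Pow before evaluation and enforce a wall-clock timeout. It assumes a POSIX host (SIGALRM) and an illustrative operand bound; the project's actual safe_eval is stricter and structured differently.

```python
import ast
import signal

MAX_POW_OPERAND = 10_000  # illustrative bound, not the real limit


def _reject_huge_pow(tree: ast.AST) -> None:
    """Refuse exponentiations with enormous constant operands (e.g. 9**9**9)."""
    for node in ast.walk(tree):
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Pow):
            for operand in (node.left, node.right):
                if isinstance(operand, ast.Constant) and isinstance(operand.value, (int, float)):
                    if abs(operand.value) > MAX_POW_OPERAND:
                        raise ValueError("exponent operand too large")


def safe_eval(expression: str, timeout_s: int = 2):
    tree = ast.parse(expression, mode="eval")
    _reject_huge_pow(tree)

    def _on_alarm(signum, frame):
        raise TimeoutError("safe_eval timed out")

    previous_handler = signal.signal(signal.SIGALRM, _on_alarm)
    previous_alarm = signal.alarm(timeout_s)  # remember any alarm the host had armed
    try:
        return eval(compile(tree, "<safe_eval>", "eval"), {"__builtins__": {}}, {})
    finally:
        signal.alarm(previous_alarm)  # roughly restore the host's alarm state
        signal.signal(signal.SIGALRM, previous_handler)
```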
Richard Tang df41732e95 chore: enhance log for LLM 2026-04-06 13:30:31 -07:00
Richard Tang cd9a625041 fix: dynamic absolute path and instruction 2026-04-06 13:21:06 -07:00
Richard Tang 420d703138 fix: quickstart extension instructions 2026-04-06 13:11:10 -07:00
Richard Tang 66866e524d fix: remove old new agent button 2026-04-06 13:05:03 -07:00
Rodrigo M.V.S. 33e6c018a3 docs: Add Frontend Dev Workflow subsection to CONTRIBUTING.md (#6523)
* chore: update package-lock.json after npm install

* fix: export validate_agent_path from server module

* fix: remove circular import in server module

* docs: Add Frontend Dev Workflow subsection to CONTRIBUTING.md

* chore: revert accidental package-lock.json changes

* docs: clarify frontend dev requires both backend and dev server

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-04-06 23:16:29 +08:00
Akash 1ac50ab532 feat: add theme toggle and tab improvements (#6062)
Co-authored-by: Akash Kumar <akash369kumar369@gmail.com>
2026-04-06 23:02:26 +08:00
Aashutosh Pandey 4df924d3d7 fix(security): prevent error_middleware from leaking internal exception details to HTTP clients (#6903)
The error_middleware was returning str(e) and type(e).__name__ directly
in JSON responses, which could expose file paths, database connection
strings, API key names, and internal class names to untrusted clients.

Changes:
- Return generic 'Internal server error' message instead of raw exception
- Improve server-side log to include request method and path
- Add unit tests verifying no internal details are leaked

The full exception traceback remains available via logger.exception()
for server-side debugging.

Co-authored-by: Aashutosh Pandey <aashutoshpandey@Aashutoshs-MacBook-Air.local>
2026-04-06 22:50:43 +08:00
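
A minimal sketch of the hardened behavior, assuming a FastAPI/Starlette-style app; the project's actual error_middleware and framework wiring may differ.

```python
import logging

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

logger = logging.getLogger(__name__)
app = FastAPI()


@app.middleware("http")
async def error_middleware(request: Request, call_next):
    try:
        return await call_next(request)
    except Exception:
        # The full traceback stays in server-side logs, with method and path.
        logger.exception("Unhandled error on %s %s", request.method, request.url.path)
        # Clients receive a generic body: no exception text, class names, or paths.
        return JSONResponse(status_code=500, content={"error": "Internal server error"})
```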
Sujan Kumar MV 8f2d87cc5d docs(tools): add README for 10 tools (batch 3) (#6913)
* docs(tools): add README for 10 tools (batch 3)

Adds README.md for: supabase_tool, zoom_tool, twitter_tool,
twilio_tool, shopify_tool, snowflake_tool, zendesk_tool,
yahoo_finance_tool, youtube_transcript_tool, docker_hub_tool

Partial fix for #6486

* docs(shopify): fix fulfillment_status value and body_html param name

- fulfillment_status example: "unfulfilled" is not a valid Shopify API
  value, changed to "unshipped"
- tool table: "description" is not the actual param name, it's body_html

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-04-06 22:10:22 +08:00
Leayx 4b795584f6 micro-fix(quickstart): correct npm invocation in powershell script (#6816)
- PowerShell supports direct command invocation without requiring the call operator "&"
- the script used the call operator "&" to invoke `npm install` and `npm run build`, which caused incorrect command parsing in PowerShell when combined with output redirection "2>&1"
- this resulted in unexpected errors such as "Unknown command: pm", even though the commands worked correctly when executed manually

- removed the unnecessary call operator "&" and invoked the npm commands directly
- npm commands now execute correctly within the script, aligning with standard PowerShell behavior and eliminating the parsing issue
2026-04-06 21:56:59 +08:00
Faryal Rzwan 6024ae4241 docs(tools): add README for 9 tools (batch 1) (#6881)
* docs(tools): add README for huggingface, jira, pinecone, langfuse, linear, mongodb, redis, vercel, confluence

* docs(tools): fix review comments in confluence, mongodb and vercel READMEs

* docs(mongodb): add MONGODB_DATA_SOURCE to setup section

The code uses os.getenv("MONGODB_DATA_SOURCE") in every API request
body as the dataSource field. Without it, requests send an empty
dataSource and fail. Add it back to the setup section.

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-04-06 21:44:51 +08:00
Hundao aaa5d661c3 fix(ci): unbreak main - playwright deps + framework test suite (#6955)
* fix(tools): move playwright back to main dependencies

playwright was moved to the browser extra in c7e85aa9 as part of the GCU
refactor to use a browser extension. But web_scrape_tool still imports
playwright at module level and requires it unconditionally, so CI's
Test Tools job breaks with ModuleNotFoundError.

web_scrape_tool has no fallback without playwright — it's a hard
dependency, not optional. Put it back in main deps.

Fixes CI failure on Test Tools (ubuntu-latest).

* chore: remove dead test_highlights.py script

tools/test_highlights.py is orphaned from the GCU refactor in c7e85aa9:

- imports highlight_coordinate and highlight_element from gcu.browser.highlight,
  but highlight.py was deleted in that refactor
- calls BrowserSession.start(), open_tab(), get_active_page(), stop() — none
  of these methods exist on the current BrowserSession class

The script can't run at all, and it's tripping ruff's I001 import-order
check (fail on Lint CI after cache invalidation).

* test: fix browser/refs tests broken by GCU refactor

Tests were still testing the old Playwright-based API after c7e85aa9
moved GCU to an extension-bridge architecture.

test_refs.py (6 tests):
  Refs system now produces CSS selectors like
  [role="button"][aria-label="Submit"]:nth-of-type(1) for the bridge's
  DOM matcher, instead of Playwright's role=button[name="Submit"] >> nth=0.
  Updated expected values to match. Renamed test_escapes_quotes_in_name to
  test_quoted_name_passes_through and added a comment noting that inner
  quotes aren't currently escaped (follow-up concern).

test_browser_tools_comprehensive.py (4 tests):
  - test_screenshot_full_page: browser_screenshot passes selector=None
    when no selector is provided; update assertion.
  - test_file_upload: browser_upload validates file paths exist on disk.
    Create real tmp files and mock the CDP calls it makes.
  - test_evaluate_with_bare_return: renamed to
    test_evaluate_passes_script_through_to_bridge. IIFE wrapping lives
    in bridge.evaluate, not in the browser_evaluate tool — mocking the
    bridge bypasses the wrapping logic, so the tool just passes the
    script through.
  - test_evaluate_complex_script: browser_evaluate returns bridge's raw
    result (no 'ok' wrapper); check for 'result' key instead.

test_browser_advanced_tools.py (deleted):
  The whole file patched get_session and page.wait_for_function (the old
  Playwright-based API). The bug it guarded against (user text interpolated
  into a JS source string) is architecturally impossible in the new
  bridge-based tools, which send text via structured RPC. Coverage for
  browser_wait exists in test_browser_tools_comprehensive.py.

* test(core): fix event_loop tests broken by hive-v1 refactor

Several framework tests were left failing or hanging after the hive-v1
refactor landed. This un-breaks CI without touching production code.

- Worker auto-escalation: 8 tests were hanging because EventLoopNode
  with event_bus treats non-queen/non-subagent nodes as workers and
  auto-escalates to queen, then blocks on _await_user_input forever
  (no queen in standalone tests). Opt out via is_subagent_mode=True.
- MockConversationStore: added clear() to match the production store
  (storage/conversation_store.py), which event_loop_node.py:425 calls.
- Executor output semantics: result.output now only contains terminal-
  node outputs; two handoff tests now read intermediate outputs from
  result.session_state["data_buffer"].
- Restore filter: test_restore_from_checkpoint needs set_current_phase
  so restore()'s phase_id filter matches.
- Removed two _build_context tests whose target method no longer exists
  (replaced by standalone build_node_context()). Remaining execution_id
  coverage is adequate in TestExecutionId + integration tests.

* style: ruff format + drop em dash in comment

* test(core): fix remaining framework tests broken by hive-v1 refactor

Rounds out the fix started in the previous commit. Full framework
suite now passes (1589 passed, 0 failed).

- conftest.py: force-bind framework.runner submodules (mcp_registry,
  mcp_client, mcp_connection_manager) as attributes on the parent
  package. Without this, pytest monkeypatch.setattr with dotted-string
  paths fails because the attribute walker can't resolve the submodule
  even though __init__.py imports from it. Affects ~25 MCP tests.
- test_queen_memory: _execute_tool() grew a required caller kwarg for
  worker type-restrictions. Pass caller="queen" so path-traversal
  checks run without caller restrictions interfering.
- test_session_manager_worker_handoff: _subscribe_worker_digest was
  removed in the refactor, dropped the dead monkeypatches.
- test_skill_context_protection: NodeConversation now reads _run_id
  in add_tool_result(), so the __new__-based test helper has to
  initialise it.
- test_node_conversation: restore() now filters parts by run_id for
  crash recovery. Renamed the stale test and flipped the assertion
  to match the new filtering semantics.
- test_tool_registry: CONTEXT_PARAMS was updated (workspace_id out,
  profile in). Switched the test's example stripped params.

* docs: drop circular PR reference in test_refs comment

Addresses CodeRabbit nitpick. The comment referenced the PR that was
adding the comment, which becomes a self-reference after merge.
2026-04-05 14:21:32 +08:00
Emmanuel Nwanguma 2e5670ace6 docs(tools): add README for 11 tools (batch 2 of 2) (#6887)
Partial fix for #6486

Add README.md for: n8n_tool, obsidian_tool, pagerduty_tool,
pipedrive_tool, plaid_tool, powerbi_tool, quickbooks_tool,
salesforce_tool, sap_tool, terraform_tool, tines_tool
2026-04-05 10:00:14 +08:00
Emmanuel Nwanguma 634658e829 docs(tools): add README for 11 tools (batch 1 of 2) (#6886)
Partial fix for #6486

Add README.md for: aws_s3_tool, azure_sql_tool, cloudinary_tool,
duckduckgo_tool, file_system_toolkits, gitlab_tool,
google_search_console_tool, greenhouse_tool, hubspot_tool,
kafka_tool, microsoft_graph_tool
2026-04-05 09:52:47 +08:00
Richard Tang dc64cc68a1 feat: add a html file for browser extension instruction 2026-04-03 21:52:42 -07:00
RichardTang-Aden e8d56c815d Merge pull request #6905 from aden-hive/feature/hive-v1
Release v0.9.0 — Browser Extension, Queen Memory v2 & Graph Executor Refactor
2026-04-03 21:13:16 -07:00
Richard Tang cc21780e99 test: ignore dummy agent in make 2026-04-03 20:49:12 -07:00
Richard Tang 714de59d2a test: ignore e2e dummy agent tests 2026-04-03 20:47:27 -07:00
Richard Tang ed8d417bef chore: ruff lint 2026-04-03 20:31:14 -07:00
Richard Tang 294df7f066 Merge remote-tracking branch 'origin/main' into feature/hive-v1 2026-04-03 20:21:53 -07:00
Richard Tang b46fe69712 fix: handle KIMI pause turn 2026-04-03 20:21:28 -07:00
Timothy 6e6efb97bd fix: close browser before finishing 2026-04-03 20:12:01 -07:00
Timothy dc4be4f906 fix: remove standalone gcu tool registry 2026-04-03 19:32:54 -07:00
Timothy 9cb20986d2 Merge branch 'feature/queen-lifecycle' into feature/hive-v1 2026-04-03 19:22:58 -07:00
Timothy 7d75c6a09f fix: queen lifecycle 2026-04-03 19:22:45 -07:00
Timothy aab38222db fix: legacy import 2026-04-03 19:16:00 -07:00
Timothy c46082780f feat: gcu extra learnings 2026-04-03 18:45:34 -07:00
Richard Tang ff8123acb9 fix: browser log path 2026-04-03 18:33:14 -07:00
Richard Tang 358b4e1bf2 Merge remote-tracking branch 'origin/feature/hive-v1' into feature/hive-v1 2026-04-03 18:29:31 -07:00
Richard Tang 934c424510 feat: handle gemini tool call tags 2026-04-03 18:29:04 -07:00
Bryan f12db37d75 fix: quickstart launch chrome extensions page 2026-04-03 18:27:48 -07:00
Richard Tang 2b263f6e10 feat: move thinking tags handling on frontend 2026-04-03 18:20:45 -07:00
Timothy 4513f5dcd7 fix: capture random errors 2026-04-03 18:12:29 -07:00
Timothy 1fabd8e8fb fix: queen phase incubating -> editing 2026-04-03 18:00:21 -07:00
Richard Tang 043c79e0e4 feat: add detailed LLM response logging to reflection loop 2026-04-03 17:43:41 -07:00
Timothy 9193336fd3 fix: browser quickstart 2026-04-03 17:40:53 -07:00
Richard Tang 59e90d3168 fix: track reflection reason 2026-04-03 17:20:13 -07:00
Timothy ef34b1190a Merge branch 'feature/browser-extension-quickstart' into feature/hive-v1 2026-04-03 17:19:07 -07:00
Timothy 1e848d67bb feat: browser extension setup guide 2026-04-03 17:18:53 -07:00
Richard Tang a0e68871f7 feat: strengthen worker termination 2026-04-03 17:06:19 -07:00
Richard Tang 767beb4005 Merge branch 'feature/colonized-memory' into feature/hive-v1 2026-04-03 16:10:45 -07:00
Richard Tang 95655a4c85 feat: better reflection tracking 2026-04-03 16:09:46 -07:00
Timothy 102866780c fix: browser tools 2026-04-03 15:47:54 -07:00
Richard Tang a379ae97c8 feat: auto-escalate worker text-only turns to queen after grace period 2026-04-03 15:46:23 -07:00
Timothy d5ae7e6c4b fix: turn ms 2026-04-03 15:26:44 -07:00
Timothy 68f6b72564 Merge branch 'refactor/automated-testing' into feature/hive-v1 2026-04-03 15:09:13 -07:00
Timothy eecfb4f407 Merge branch 'feature/colonized-memory' into refactor/automated-testing 2026-04-03 14:43:41 -07:00
Timothy 32f556cd6e feat: incubating phase 2026-04-03 14:43:05 -07:00
Richard Tang 8ea026508d fix: scope conversation restore to current run_id 2026-04-03 13:42:13 -07:00
Richard Tang 771efd5ce4 feat: simplify worker reflection 2026-04-03 13:03:47 -07:00
Timothy 8f56b8b068 feat: verified testing 2026-04-03 13:00:49 -07:00
Richard Tang 4f588b3010 fix: remove outdated memory cursor design 2026-04-03 12:38:05 -07:00
Richard Tang 9f70868f98 feat: include v1 memory in migration and keep the diary writing in v2 2026-04-03 11:38:30 -07:00
Richard Tang 6449c76091 refactor: remove old worker digest 2026-04-03 11:20:22 -07:00
Richard Tang b328ced110 fix: remove bounded polling loop that killed forever-alive graphs after ~100s 2026-04-03 10:16:34 -07:00
Richard Tang 1b6e8c34be fix: queen revive drops user input and missing skill protocols 2026-04-03 10:01:13 -07:00
Timothy 674454cc5b Merge branch 'feature/colonized-memory' into refactor/automated-testing 2026-04-03 09:58:03 -07:00
Timothy 59c3979451 feat: auto tests 2026-04-03 09:57:40 -07:00
Richard Tang 51fdd93f0c fix: queen session and node registry 2026-04-03 09:11:33 -07:00
Timothy 95f1d1abcd feat: browser automated test 2026-04-03 07:31:10 -07:00
Richard Tang a164ed6faf strengthen the logging 2026-04-02 20:03:53 -07:00
Richard Tang abe3d2d067 feat: add debugger information 2026-04-02 17:54:42 -07:00
Richard Tang c80d86bdbe fix: missing restored pending input 2026-04-02 17:12:09 -07:00
Richard Tang ec08ae7438 feat: worker agent memory 2026-04-02 17:05:32 -07:00
Timothy e0cd16b92b fix: trailing white spaces 2026-04-02 16:43:23 -07:00
Richard Tang 4006ee96b6 feat: add dummy agent smoke test 2026-04-02 16:29:00 -07:00
Richard Tang b78c879404 feat: context rewiring 2026-04-02 16:01:06 -07:00
Timothy 71a71beca7 feat: extension browser tools 2026-04-02 15:58:52 -07:00
Richard Tang c5052ade34 feat: consolidate context building 2026-04-02 15:54:16 -07:00
Richard Tang e1911b3684 refactor: deprecated client facing node 2026-04-02 15:09:26 -07:00
Richard Tang 45cfae5217 feat: add hive llm healthcheck 2026-04-02 14:50:05 -07:00
RichardTang-Aden 4d877469d5 Merge pull request #6195 from Sri-Likhita-adru/fix/stream-transient-retry-cap
fix(llm): separate retry counters in stream() - transient errors used wrong cap
2026-04-02 14:01:00 -07:00
Richard Tang 96c7070cc9 fix: restore dummy agent smoke tests 2026-04-02 13:29:57 -07:00
Richard Tang 6affe06f6d feat: add kimi support for dummy agent test 2026-04-02 13:12:09 -07:00
Richard Tang 02edd44283 feat: First-Class Worker Agents with Event-Driven Dependency Execution 2026-04-02 13:00:52 -07:00
Richard Tang 60d094464a feat: robust run id 2026-04-02 12:35:16 -07:00
Timothy c7e85aa9f5 fix: redo gcu tools for extension based browser use 2026-04-02 12:07:24 -07:00
Richard Tang 00c55d5fb2 refactor: remove unused edge code 2026-04-02 12:02:51 -07:00
Richard Tang 6a7778ebcd refactor: remove orphaned client code 2026-04-02 12:00:59 -07:00
Timothy 8f042b7ca5 feat: browser extension 2026-04-02 11:59:57 -07:00
Richard Tang b594165575 feat: fresh worker context per run 2026-04-02 11:43:14 -07:00
Timothy 1630c1ee7a feat: add tab and CDP methods to browser bridge
Added methods to control tabs via the Chrome extension:
- create_tab(groupId, url) - create and navigate tabs in user's Chrome
- close_tab(tabId) - close tabs
- list_tabs(groupId?) - list tabs
- cdp_attach(tabId) - attach CDP for automation
- cdp_send(tabId, method, params) - send CDP commands

These enable browser automation through the extension when Playwright
can't connect directly to the user's Chrome.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 11:09:07 -07:00
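
A hypothetical composition of the bridge methods listed in the commit; only the method names come from the commit, while the bridge object, snake_case parameters, and async style are assumptions.

```python
# Hypothetical usage sketch: create a tab, attach CDP, send a command, clean up.
async def demo(bridge) -> None:
    tab = await bridge.create_tab(group_id="agent-1", url="https://example.com")
    await bridge.cdp_attach(tab_id=tab["id"])
    result = await bridge.cdp_send(
        tab_id=tab["id"],
        method="Runtime.evaluate",              # a standard CDP command
        params={"expression": "document.title"},
    )
    print(result)
    print(await bridge.list_tabs(group_id="agent-1"))
    await bridge.close_tab(tab_id=tab["id"])
```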
Richard Tang 318ecfd508 refactor: refactor shared memory to data buffer 2026-04-02 11:02:30 -07:00
Timothy 08b0cbc208 fix: inherit storage state from user's Chrome when connected via CDP
When Playwright connects to the user's Chrome via CDP (bridge connected),
we now copy cookies/storage from an existing browser context into the
new agent context. This preserves login sessions (LinkedIn, etc.).

Before: New context created fresh → no cookies → login wall
After:  New context inherits storage state → cookies preserved → logged in

Requires Chrome to be started with --remote-debugging-port=9222 or
HIVE_BROWSER_CDP_URL to be set for this to work.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 10:45:53 -07:00
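
A minimal sketch of the storage-state inheritance described above, using Playwright's public Python API (connect_over_cdp, storage_state, new_context); the project's actual session plumbing differs.

```python
import os

from playwright.sync_api import sync_playwright

cdp_url = os.getenv("HIVE_BROWSER_CDP_URL", "http://localhost:9222")

with sync_playwright() as p:
    browser = p.chromium.connect_over_cdp(cdp_url)
    # Copy cookies/localStorage from the user's existing Chrome context ...
    state = browser.contexts[0].storage_state() if browser.contexts else None
    # ... into the fresh agent context so login sessions (LinkedIn, etc.) carry over.
    context = browser.new_context(storage_state=state)
    page = context.new_page()
    page.goto("https://example.com")
```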
Timothy 4bcbebf761 fix: subagent browser sessions persist across calls
Two fixes for browser session persistence:

1. Use stable profile name (agent_id only, not agent_id-subagent_instance)
   - Before: "honeycomb_linkedin_outreach-gcu-scan-profiles-1" (unique each call)
   - After: "gcu-scan-profiles" (stable across calls)

2. Remove browser_stop() call in finally block
   - Keeping browser alive allows cookies/auth to persist
   - Browser cleaned up when parent agent stops or explicitly requested

This fixes the issue where LinkedIn auth was lost between subagent runs
because each run created a fresh browser profile with no cookies.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 10:44:21 -07:00
Timothy 00a3f94315 fix: browser tools use subagent profile from context
Changed all browser tool `profile` parameters from defaulting to "default"
to defaulting to None. This allows `get_session()` to use the context
variable set by `set_active_profile()` in the subagent executor.

Before: Subagent calls browser_navigate() → profile="default" → tab group named "default"
After:  Subagent calls browser_navigate() → profile=None → get_session() uses contextvar → tab group named "{agent_id}-{subagent_id}"

Fixes tab groups being named "default" instead of the subagent's name.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 10:36:30 -07:00
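
A minimal sketch of the contextvar-based profile resolution described above; set_active_profile and get_session mirror the names in the commit, but the bodies and return types here are illustrative.

```python
import contextvars

_active_profile: contextvars.ContextVar[str] = contextvars.ContextVar(
    "active_profile", default="default"
)


def set_active_profile(profile: str) -> None:
    """Called by the subagent executor before a subagent's tools run."""
    _active_profile.set(profile)


def get_session(profile: str | None = None) -> str:
    # profile=None means "use whatever the executor set for this subagent"
    # rather than silently collapsing every tab group onto "default".
    return profile if profile is not None else _active_profile.get()
```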
Richard Tang 5b08edb384 Remove unused subagent escalation receiver 2026-04-02 10:32:33 -07:00
Richard Tang 332311b49b refactor: graph executor cleanup 2026-04-02 10:20:46 -07:00
Richard Tang 0e5f571b09 refactor: removed unused ticket escalate and unused runner code 2026-04-01 20:10:41 -07:00
Richard Tang bf06984625 refactor: remove deprecated cli command and orchestrator 2026-04-01 19:58:54 -07:00
Richard Tang d4875892fc chore: remove temp docs 2026-04-01 19:45:59 -07:00
Richard Tang 86f6aa2e8f refactor: remove deprecated functions in runner and runtime 2026-04-01 19:43:29 -07:00
Richard Tang 537667758a refactor: remove worker input and worker session 2026-04-01 19:16:38 -07:00
Timothy 76fe644cac chore: fix lint 2026-04-01 19:06:40 -07:00
Timothy c6c333761b fix: lint errors in compaction module
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-01 19:06:05 -07:00
Richard Tang 6a77a9a7b2 refactor: rename worker functions for clarity 2026-04-01 19:04:44 -07:00
Richard Tang 1a37fb2f36 tests: add tests to memory functions 2026-04-01 17:41:25 -07:00
Richard Tang 2609ca7619 fix: memory call bugs 2026-04-01 17:38:53 -07:00
Richard Tang b7a115259d fix: outdated tests 2026-04-01 17:33:17 -07:00
Richard Tang 1765e1cb6c feat: debugger and simplification 2026-04-01 17:28:54 -07:00
Timothy 0417e33ab2 fix: batch modify gmail tool 2026-04-01 17:23:57 -07:00
Timothy 42a9c7b0f1 fix: structural compaction guardrail 2026-04-01 17:03:13 -07:00
Timothy 398e86d787 fix: memory recall type bug 2026-04-01 16:43:18 -07:00
Timothy 51406e358e Merge branch 'feature/new-compaction' into feature/hive-v1 2026-04-01 16:06:56 -07:00
Timothy 137162eada feature: improve micro compaction 2026-04-01 16:06:35 -07:00
Timothy 2b7a38f746 feat: tagged queen memory 2026-04-01 15:13:49 -07:00
Richard Tang b25de61363 feat(wip): new queen memory 2026-04-01 15:03:21 -07:00
Sundaram Kumar Jha 6022f6c911 refactor: bit estimation formula 2026-04-02 00:44:50 +05:30
Sundaram Kumar Jha dacda3337f test(safe_eval): cover alarm state preservation 2026-04-02 00:12:15 +05:30
Sundaram Kumar Jha 267f797abc fix(security): preserve host alarm state in safe_eval 2026-04-02 00:12:03 +05:30
Sundaram Kumar Jha 42fd1ec8d1 chore: formatted 2026-04-01 23:47:37 +05:30
Sundaram Kumar Jha 81774d5d0e test(safe_eval): cover execution timeout behavior 2026-04-01 23:36:14 +05:30
Richard Tang b3adbe745f chore: ruff lint 2026-04-01 11:05:53 -07:00
Sundaram Kumar Jha d1cbfd1e54 fix(security): enforce safe_eval execution timeout 2026-04-01 23:35:41 +05:30
Richard Tang f3fefe0cbc refactor: remove adapt md and its reference 2026-04-01 11:00:41 -07:00
Sundaram Kumar Jha fd71501215 test(safe_eval): add ast.Pow DoS regression coverage 2026-04-01 23:29:02 +05:30
Sundaram Kumar Jha 406bfb23b9 fix(security): bound ast.Pow in safe_eval 2026-04-01 23:28:57 +05:30
Bryan @ Aden c8a25a0287 Merge pull request #6658 from saurabhiiitm062/feat/cloudflare-dns-tool
feat: cloudflare DNS/Zone tool integrations
2026-04-01 10:11:44 -07:00
Hundao 5823513fde fix: propagate contextvars to tool executor threads (#6854)
* fix: propagate contextvars to tool executor threads

run_in_executor does not propagate contextvars to worker threads,
causing execution context params like data_dir to be lost when MCP
tools are called. This made save_data, serve_file_to_user, and other
tools that depend on auto-injected data_dir fail with "Missing
required argument: data_dir".

Fix: use contextvars.copy_context().run() to carry the current context
into the thread pool worker.

* test: regression test for contextvars propagation in tool executor

Verifies that execution context (data_dir, etc.) set via
set_execution_context is visible inside tool executors that run
in thread pool workers via run_in_executor.
2026-04-01 19:38:41 +08:00
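
A minimal sketch of the fix described in the commit: copy the caller's context and run the tool inside it on the thread pool, so contextvars like data_dir remain visible. Function names are illustrative.

```python
import asyncio
import contextvars
import functools


async def call_tool_in_thread(tool_fn, *args, **kwargs):
    loop = asyncio.get_running_loop()
    ctx = contextvars.copy_context()
    call = functools.partial(tool_fn, *args, **kwargs)
    # run_in_executor alone would lose the caller's contextvars; ctx.run(call)
    # executes the tool with them intact on the worker thread.
    return await loop.run_in_executor(None, lambda: ctx.run(call))
```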
Hundao 97ce8dfc54 fix: skip executable permission check on Windows in skill validator (#6894)
Windows has no POSIX executable bits, so the stat-based check always
fails. Skip the check on Windows in the validator, and mark the two
related tests as POSIX-only. Unix CI still catches non-executable
scripts from Windows contributors.

Fixes #6893
2026-04-01 19:37:18 +08:00
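
A minimal sketch of the platform-gated check described above; the validator's real function name and error type are assumptions.

```python
import os
import stat
import sys


def check_script_executable(path: str) -> None:
    # Windows has no POSIX executable bits, so the stat-based check is skipped;
    # Unix CI still rejects non-executable scripts from Windows contributors.
    if sys.platform == "win32":
        return
    mode = os.stat(path).st_mode
    if not mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH):
        raise ValueError(f"skill script is not executable: {path}")
```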
Gaurav Rai 5e628c7606 test(core): increase tool_registry coverage from 47% to 69% (#6818)
* test(core): increase tool_registry coverage from 47% to 69%

Add 19 new tests covering previously untested paths in ToolRegistry:

- register_function: type hint inference (int/float/bool/dict/list),
  required vs optional params, custom name/description, docstring fallback,
  executor delegation
- discover_from_module: @tool decorator pickup, missing-file zero-count,
  TOOLS dict without tool_executor uses mock executor
- has_tool / get_registered_names basic assertions
- Session context injection into MCP tool calls via set_session_context()
- Execution context override (contextvars) wins over session context
- _convert_mcp_tool_to_framework_tool: strips CONTEXT_PARAMS from both
  properties and required lists
- load_mcp_config: list format, dict format, graceful invalid-JSON warning
- resync_mcp_servers_if_needed: returns False with no clients; returns
  False when credentials and ADEN_API_KEY are unchanged

Coverage: 47 % → 69 % (+22 pp), all 31 tests pass, ruff clean.

Relates to #1972

* style: fix formatting in test_tool_registry.py

* fix: accept **kwargs in fake_load_registry to match updated signature

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-04-01 19:10:04 +08:00
Harsh Gajjar 5b931982e3 feat(tools): add Freshdesk helpdesk integration (#6099)
* feat(freshdesk): add Freshdesk tool integration with credentials and API functionality

- Introduced Freshdesk tool for managing tickets, contacts, agents, and groups via Freshdesk API v2.
- Added Freshdesk credentials handling in `credentials/freshdesk.py`.
- Registered Freshdesk tools in `tools/freshdesk_tool/__init__.py` and `tools/freshdesk_tool/freshdesk_tool.py`.
- Updated `__init__.py` files to include Freshdesk in the exports.
- Created comprehensive README for Freshdesk tool usage and setup.
- Implemented unit tests for Freshdesk tool functionality.

All tests pass, and code adheres to ruff linting and formatting standards.

* refactor(freshdesk_tool): simplify _get_domain logic

- remove unnecessary try/except around credentials.get("freshdesk_domain")
- directly return stripped credential value if present
- fallback to FRESHDESK_DOMAIN env variable when missing
- eliminate unreachable code while preserving behavior

* refactor(freshdesk_tool): replace dynamic httpx dispatch in _request

- replace getattr(httpx, method) with explicit handling for get, post, and put
- raise ValueError for unsupported HTTP methods
- preserve existing status handling and response parsing logic

* docs(freshdesk): improve credential and error handling documentation

- add docstrings for error handling helpers in freshdesk_tool
- document purpose and usage of freshdesk credential specs
- improve clarity around error response structure and handling
2026-04-01 18:28:32 +08:00
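
A minimal sketch of the explicit httpx dispatch the refactor describes (get/post/put handled explicitly, anything else rejected); the real _request also injects Freshdesk credentials and parses responses.

```python
import httpx


def _request(url: str, method: str, *, headers: dict, json: dict | None = None,
             params: dict | None = None) -> httpx.Response:
    method = method.lower()
    if method == "get":
        return httpx.get(url, headers=headers, params=params)
    if method == "post":
        return httpx.post(url, headers=headers, json=json)
    if method == "put":
        return httpx.put(url, headers=headers, json=json)
    # Explicit dispatch replaces getattr(httpx, method); unknown verbs fail fast.
    raise ValueError(f"Unsupported HTTP method: {method}")
```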
Bhuvaneswari N 8174f330ae docs: document 5 new natively supported LLM providers (#6865) 2026-04-01 18:18:15 +08:00
Rohit Singh 9774e53720 feat(runtime): add idempotency key support to trigger() (#6710) 2026-04-01 18:03:31 +08:00
Timothy @aden cf3296984c Merge pull request #6888 from aden-hive/fix/python-test
fix(micro-fix): python test
2026-03-31 19:02:23 -07:00
Timothy eafbeb78b4 fix: python test 2026-03-31 18:55:24 -07:00
Timothy 5cb5083f8d fix(micro-fix): queen skill allowlist 2026-03-31 18:52:45 -07:00
Bryan @ Aden bf86daee92 Merge pull request #6319 from KartikPawade/fix/sap-tool-credential-store
fix: use CredentialStoreAdapter in sap_tool instead of raw os.getenv()
2026-03-31 18:30:21 -07:00
Timothy 43bbd0f31f feat(micro-fix): skill cli parser 2026-03-31 18:13:01 -07:00
Timothy @aden 2cf962b538 Merge pull request #6782 from levxn/skills/cli-commands
feat(skills): implement hive skill CLI subcommands (CLI-1 through CLI-13)
2026-03-31 17:59:25 -07:00
Timothy 4298196700 Merge branch 'main' into feature/agent-skills 2026-03-31 17:53:57 -07:00
Timothy @aden bc1f712e42 Merge pull request #6610 from levxn/skills/ds-ovrride-heuristics
feat(skills): DS-12 and DS-13 — config override application, batch auto-detection, and context preservation warning
2026-03-31 17:51:19 -07:00
Timothy @aden cccbcc8ec3 Merge pull request #6529 from vakrahul/fix/mcp-structured-errors
feat: structured MCP error codes and failure diagnostics (closes #6352)
2026-03-31 17:40:50 -07:00
Timothy @aden 0722f83f16 Merge pull request #6792 from fermano/feat/agent-selection-tool-resolution-n-framework-integration
Feat/agent selection tool resolution n framework integration
2026-03-31 17:38:39 -07:00
saurabhiiitm062 ebb6605a86 fix: address Cloudflare review comments (DDoS, pagination, validation, tests) 2026-03-31 22:23:51 +05:30
Hundao 72091d2783 fix(security): add SSRF protection to web_scrape tool (#6879)
Validate URLs against internal network ranges before making requests.
Block private IPs, loopback, link-local, and cloud metadata endpoints
(169.254.169.254). Intercept Playwright navigation to catch redirect-based
SSRF bypasses.

Fixes #1157

Co-authored-by: Harshit <Harshitk-cp@users.noreply.github.com>
2026-03-31 14:04:47 +08:00
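
A minimal sketch of the URL validation described above, using the standard-library ipaddress module; the actual web_scrape guard also intercepts Playwright navigations to catch redirect-based bypasses, and the helper name here is illustrative.

```python
import ipaddress
import socket
from urllib.parse import urlparse


def assert_url_is_public(url: str) -> None:
    host = urlparse(url).hostname
    if not host:
        raise ValueError("URL has no hostname")
    for info in socket.getaddrinfo(host, None):
        ip = ipaddress.ip_address(info[4][0])
        # Block private ranges, loopback, link-local (which covers the
        # 169.254.169.254 cloud metadata endpoint), and reserved space.
        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved:
            raise ValueError(f"blocked internal address for {host}: {ip}")
```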
Kartik 3bb69a5784 fix: add env fallback and type hints for SAP tool credentials
Made-with: Cursor
2026-03-31 11:15:41 +05:30
Kartik 63fb089062 chore: format sap_tool.py
Made-with: Cursor
2026-03-31 10:21:56 +05:30
Hundao d5ba985e29 docs: fix agent.json examples to match current schema (#6878)
Replace outdated node_id/edge_id with id, wrap nodes/edges under
graph key, add goal section with success_criteria. Matches what
load_agent_export() and NodeSpec actually expect.

Fixes #897

Co-authored-by: Jose37456 <Jose37456@users.noreply.github.com>
2026-03-31 12:41:59 +08:00
Bryan @ Aden 6ee510d2f6 Merge pull request #6855 from Ttian18/feat/tina/docs-mcp-unix-sse-transport
docs: add Unix socket and SSE transport to MCP Integration Guide (#6739)
2026-03-30 18:46:01 -07:00
Bryan @ Aden 45b350e7c8 Merge pull request #6857 from Ttian18/feat/tina/job-hunter-pdf-resume
feat(job-hunter): support PDF resume input via file path (#6740)
2026-03-30 18:45:36 -07:00
Bryan @ Aden 7e690de12f Merge pull request #6844 from sundaram2021/fix/quickstart-credentials-in-windows
micro-fix:  shell config handling and add antigravity option
2026-03-30 17:20:36 -07:00
Hundao ae85d2bf59 fix(security): prevent path traversal in session_store (#6876)
Validate that resolved session path stays within the sessions directory
using Path.is_relative_to(). Prevents session_id values like
"../../something" from escaping the sandbox.

Also guard the caller in _write_run_event where get_session_path is
called outside the existing OSError try/except block.

Fixes #1000

Co-authored-by: Sidhartha kumar <Alearner12@users.noreply.github.com>
2026-03-30 23:53:23 +08:00
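
A minimal sketch of the containment check described above, built on Path.is_relative_to(); the sessions directory location and function name are illustrative.

```python
from pathlib import Path

SESSIONS_DIR = Path.home() / ".hive" / "sessions"  # illustrative location


def get_session_path(session_id: str) -> Path:
    path = (SESSIONS_DIR / f"{session_id}.json").resolve()
    # Reject ids like "../../something" that would escape the sessions directory.
    if not path.is_relative_to(SESSIONS_DIR.resolve()):
        raise ValueError(f"invalid session_id: {session_id!r}")
    return path
```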
Juttiga Bheemeswar e9fd0158b9 fix(csv_sql): prevent SQL injection via DuckDB parameter binding (#1408)
* fix(csv_sql): prevent SQL injection via DuckDB parameter binding

* test(csv_sql): add regression test for apostrophe path

* Refactor CSV query function for security and clarity

Removed detailed docstring arguments and return information for the CSV query function. Improved security checks for SQL queries.

* fix/1256-csv-sql-safe-path

Added security regression tests to reject non-SELECT queries and multi-statement queries.

* docs: restore csv_sql docstring (Args, Returns, Examples)

* fix: use word-boundary regex for SQL keyword detection

Substring matching caused false positives on column names like
created_at, updated_at, deleted_at. Switch to \b word-boundary regex.
Also add tests for comment rejection, CTE queries, and keyword-in-column-name.

---------

Co-authored-by: Juttiga Bheem <BBemail@gmail.com>
Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-30 23:26:07 +08:00
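
A minimal sketch of the two measures described in the commit thread: user values travel as DuckDB bound parameters, and the non-SELECT keyword check uses word boundaries so columns like created_at are not false positives. The table, data, and keyword list are illustrative; the real csv_sql tool also rejects multi-statement queries and comments, per the tests above.

```python
import re

import duckdb

FORBIDDEN = re.compile(r"\b(insert|update|delete|drop|alter|attach|copy|pragma)\b", re.IGNORECASE)


def run_select(con: duckdb.DuckDBPyConnection, sql: str, params: list):
    # Word-boundary matching: "delete" is caught, "deleted_at" is not.
    if FORBIDDEN.search(sql):
        raise ValueError("only SELECT queries are allowed")
    # User values are bound, never concatenated, so "O'Brien" cannot break
    # out of a string literal and inject SQL.
    return con.execute(sql, params).fetchall()


con = duckdb.connect()
con.execute("CREATE TABLE data AS SELECT * FROM (VALUES ('O''Brien', 1), ('Smith', 2)) t(name, id)")
print(run_select(con, "SELECT * FROM data WHERE name = ?", ["O'Brien"]))
```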
Zhang 9a68a5d7ee fix(job-hunter): align intake node client_facing and input_keys with agent.json 2026-03-29 11:33:57 -07:00
Zhang 33edf4a207 feat(job-hunter): support PDF resume input via file path (#6740) 2026-03-29 11:26:35 -07:00
Zhang f9fdaf5adc docs: clarify required vs optional fields for Unix and SSE transports 2026-03-29 10:29:47 -07:00
Zhang eabb17934c docs: add Unix socket and SSE transport types to MCP Integration Guide 2026-03-29 10:24:57 -07:00
kernel_crush eba7524955 refactor: remove deprecated storage/backend.py (267 lines) (#6849)
* refactor: remove deprecated storage/backend.py (267 lines)

Delete the fully deprecated FileStorage class and inline its 5 still-active
methods (_validate_key, _load_run_sync, _load_summary_sync, _delete_run_sync,
_list_all_runs_sync) directly into ConcurrentStorage.

Changes:
- Delete core/framework/storage/backend.py (267 lines of no-op/deprecated code)
- Inline active read methods into ConcurrentStorage (no new FileStorage dep)
- Remove deprecated index operations (get_runs_by_goal, get_runs_by_status,
  get_runs_by_node, list_all_goals) and their associated locking
- Update __init__.py to export ConcurrentStorage instead of FileStorage
- Update runtime/core.py to use ConcurrentStorage directly
- Fix Runtime.end_run() to call save_run_sync() (sync wrapper) instead of
  the async save_run(), which was silently dropping the coroutine
- Update test_path_traversal_fix.py to test ConcurrentStorage._validate_key()
- Clean up test_storage.py — remove all FileStorage test classes, un-skip
  ConcurrentStorage tests now that it's self-contained
- Remove stale FileStorage references from testing/test_storage.py docstring,
  testing/debug_tool.py docstring, and test_runtime.py skip reasons

All 44 tests pass, ruff check and ruff format clean.

Fixes #6797

* fix(core): address CodeRabbitAI PR review feedback

 - Fix critical no-op in ConcurrentStorage._save_run_sync by implementing atomic persistence to runs/{run_id}.json.
 - Update test_path_traversal_fix.py to test ConcurrentStorage directly and use real file paths for end-to-end validation.
 - Unskip test_run_saved_on_end and assert actual run file persistence.
 - Fix debug_tool.py to use load_run_sync() instead of the async load_run().

* fix(core): address round 2 of CodeRabbitAI reviews

 - Add _validate_key to _save_run_sync and _load_summary_sync to enforce path traversal protections on the lowest level APIs.
 - Invalidate summary cache and refresh run cache in save_run_sync() to match the async save_run() cache coherence behavior.
 - Add tests for load_summary and save_run_sync path traversal rejection.
2026-03-29 22:48:12 +08:00
Sundaram Kumar Jha c56440340a Merge origin/main into fix/quickstart-credentials-in-windows 2026-03-29 08:44:26 +05:30
Bhuvaneswari N c889ffd85d feat(scripts): add support for more LLM providers in check_llm_key.py (#6833)
* feat(scripts): add support for more LLM providers in check_llm_key.py

* fix(scripts): correct perplexity endpoint to /v1/models and simplify lambda kwargs to **_
2026-03-29 09:11:25 +08:00
Md. Afzal Hassan Ehsani 905a4f3516 feat(quickstart): add Local (Ollama) LLM provider option (#6028)
* feat(quickstart): add Local (Ollama) LLM provider option
- Detect Ollama via 'ollama list' in quickstart.sh and quickstart.ps1
- Add 'Local (Ollama)' menu option with interactive model picker
- Save provider=ollama, model=<selected> to ~/.hive/configuration.json
- Omit api_key_env_var for Ollama (no API key required)
Refs #5154, #5231

* feat: add local Ollama support and resolve native tool calling

This integrates Ollama as a first-class local provider choice during quickstart, and patches several configuration barriers preventing local models from safely executing the framework's agent graphs.

* **Quickstart Integration**: Added `Local (Ollama)` to the provider menu in both quickstart.sh and quickstart.ps1. When selected, it automatically queries `ollama list` and allows the user to pick an installed model without prompting for an API key.
* **Routing & Configuration**: Automatically sets `"api_base": "http://localhost:11434"` so LiteLLM routes correctly to the local daemon, and increases the default max_tokens config.py allocation to `32768`.
* **Native Tool Calling**: Normalized Ollama models to strictly use the ollama_chat provider prefix inside litellm.py and registered them as `supports_function_calling: True`. This forces native structured function calling and fixes the infinite loop caused by JSON-mode text fallbacks.
* **Context Truncation Fix**: Updated config.py to explicitly pass `"num_ctx": 16384` to Ollama. This prevents the local daemon from silently truncating the Queen agent's ~9,500 token system prompt (Ollama defaults to 2048 `num_ctx`).
* **UX Warnings**: Added terminal notices warning users to select high-parameter models (e.g., `qwen2.5:72b+`) to ensure sufficient contextual reasoning abilities.

Resolves #6027
Resolves #6028

* test: add unit tests for Ollama helper functions

Cover _is_ollama_model(), _ensure_ollama_chat_prefix(), and num_ctx
injection in get_llm_extra_kwargs() as requested in PR review.
Fix existing test_init_ollama_no_key_needed assertion to expect the
normalised ollama_chat/ prefix.

Made-with: Cursor

* chores: fixed merge conflict

* fix(ollama): address PR review comments and normalize provider config

* fix(ollama): align quickstart defaults and add tool_choice comment

* fix(ollama): enforce OLLAMA_DETECTED logic and resolve quickstart script syntax errors

* fix(ollama): align quickstart logic and cleanup test imports
2026-03-29 08:51:47 +08:00
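A sketch of the Ollama normalization and context-size handling the PR above describes. The function names (_is_ollama_model, _ensure_ollama_chat_prefix, get_llm_extra_kwargs), the api_base, and the num_ctx value come from the commit messages; the exact signatures and return shapes are assumptions.

```python
OLLAMA_API_BASE = "http://localhost:11434"
OLLAMA_NUM_CTX = 16384  # avoid Ollama's 2048-token default silently truncating large system prompts


def _is_ollama_model(model: str) -> bool:
    return model.startswith(("ollama/", "ollama_chat/"))


def _ensure_ollama_chat_prefix(model: str) -> str:
    # Normalize to the ollama_chat provider prefix so LiteLLM uses native
    # structured function calling instead of JSON-mode text fallbacks.
    if model.startswith("ollama/"):
        return "ollama_chat/" + model[len("ollama/"):]
    return model


def get_llm_extra_kwargs(model: str) -> dict:
    if not _is_ollama_model(model):
        return {}
    return {"api_base": OLLAMA_API_BASE, "num_ctx": OLLAMA_NUM_CTX}
```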
Sundaram Kumar Jha 941605720f fix: add missing antigravity subscription option 2026-03-28 23:46:01 +05:30
Sundaram Kumar Jha 72e5c5c1c6 test: cover shell config fallbacks 2026-03-28 23:38:02 +05:30
Sundaram Kumar Jha 0f42c8c8c1 fix: align Git Bash shell config handling 2026-03-28 23:37:53 +05:30
RichardTang-Aden c3c3075610 Merge pull request #6811 from Hundao/fix/lazy-import-resend
fix: lazy import resend in email_tool
2026-03-27 14:37:41 -07:00
saurabhiiitm062 e9c1731c0f fix: address review comments + all tests passing 2026-03-28 00:58:28 +05:30
SAURABH KUMAR 0e2333daaf Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:47:29 +05:30
SAURABH KUMAR 5167c29aed Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:47:11 +05:30
SAURABH KUMAR 4da4d3b2c0 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:46:51 +05:30
SAURABH KUMAR 3e622af484 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:46:35 +05:30
SAURABH KUMAR 6600ce0ef9 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:46:21 +05:30
SAURABH KUMAR 74d5dd03dd Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:46:05 +05:30
SAURABH KUMAR d18091bb2c Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:45:50 +05:30
SAURABH KUMAR d1a1f36d6e Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:45:35 +05:30
SAURABH KUMAR 051b0fcef2 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:45:22 +05:30
SAURABH KUMAR e270d3210d Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:45:07 +05:30
SAURABH KUMAR d4a66d4b5f Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:44:51 +05:30
SAURABH KUMAR ad39b6ea50 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:44:29 +05:30
SAURABH KUMAR 71baf6166d Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:44:13 +05:30
SAURABH KUMAR 25afdae093 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:43:57 +05:30
SAURABH KUMAR 21700eb2ec Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:43:42 +05:30
SAURABH KUMAR 617462df52 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:43:23 +05:30
SAURABH KUMAR b3c1f1436b Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:42:46 +05:30
SAURABH KUMAR 310b922ce8 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:42:30 +05:30
SAURABH KUMAR 20b6553b07 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:42:09 +05:30
SAURABH KUMAR 1035cc9481 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:41:52 +05:30
SAURABH KUMAR 5d6dd1caa6 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:41:32 +05:30
SAURABH KUMAR 45ba771650 Apply suggestion from @levxn
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:41:15 +05:30
SAURABH KUMAR a4b15c0320 Add health check endpoint for Cloudflare API 2026-03-27 22:35:09 +05:30
SAURABH KUMAR 211619120e Update tools/tests/tools/test_cloudflare.py
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:26:50 +05:30
SAURABH KUMAR a78bb16e4b Update tools/tests/tools/test_cloudflare.py
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:26:00 +05:30
SAURABH KUMAR c93bcee933 Update tools/tests/tools/test_cloudflare.py
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:25:48 +05:30
SAURABH KUMAR 08160a004a Update tools/tests/tools/test_cloudflare.py
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:25:33 +05:30
SAURABH KUMAR ccd5de7496 Update tools/tests/tools/test_cloudflare.py
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:25:10 +05:30
SAURABH KUMAR c332ef8823 Update tools/tests/tools/test_cloudflare.py
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:24:58 +05:30
SAURABH KUMAR 06db11eebf Update tools/tests/tools/test_cloudflare.py
Co-authored-by: Levin <105410870+levxn@users.noreply.github.com>
2026-03-27 22:24:43 +05:30
Bryan @ Aden 86ef6fd8c5 Merge pull request #6822 from sundaram2021/fix/date-formatting-issue-on-windows
micro-fix: fix date formatting issue on windows and mattermost formatting issue
2026-03-27 07:24:26 -07:00
Sundaram Kumar Jha 95bdf4fe32 fix: mattermost formatting issue 2026-03-27 09:55:28 +05:30
Sundaram Kumar Jha 890d303d26 test: cover queen memory date formatting on Windows 2026-03-27 09:46:42 +05:30
Sundaram Kumar Jha 7fe60991e1 fix: use cross-platform queen memory date formatting 2026-03-27 09:46:27 +05:30
RichardTang-Aden a72938a163 Merge pull request #6747 from wakqasahmed/feat/mattermost-integration
feat(tools): add Mattermost messaging platform integration
2026-03-26 15:27:00 -07:00
Richard Tang 326a3dd1b7 docs: add honeycomb in readme 2026-03-26 14:55:00 -07:00
Richard Tang 183c6e2620 docs: readme with harness 2026-03-26 14:50:55 -07:00
Timothy @aden 1b40bff7da Merge pull request #6803 from aden-hive/fix/queen-cannot-read-skills
fix: allow curl in run_command and fix queen custom skill discovery
2026-03-26 12:56:03 -07:00
Timothy @aden 38b79edaee Merge pull request #6633 from sundaram2021/refactor/event-loop-node-modularization
refactor: modularize event loop node class methods and helpers
2026-03-26 12:47:53 -07:00
Sundaram Kumar Jha eb4f180192 chore: pull latest change 2026-03-27 00:48:01 +05:30
Sundaram Kumar Jha bf0b9a1edb refactor: cleanup compact llm function 2026-03-27 00:45:42 +05:30
Sundaram Kumar Jha 9667dd25cb chore: pull latest changes 2026-03-26 21:54:56 +05:30
hundao 33e4e8d440 fix: lazy import resend in email_tool to prevent tool registration crash
Fixes #4816
2026-03-26 18:43:04 +08:00
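The lazy-import pattern this fix refers to, sketched: defer importing resend to call time so a missing optional dependency cannot crash tool registration. The function name and the sender address are illustrative.

```python
def send_email(to: str, subject: str, body: str) -> dict:
    # Importing resend here instead of at module import time means the email
    # tool can still be registered when the optional dependency is absent.
    try:
        import resend
    except ImportError as exc:
        raise RuntimeError("the resend package is required for email_tool") from exc

    return resend.Emails.send(
        {"from": "agent@example.com", "to": to, "subject": subject, "text": body}
    )
```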
Shiva Santosh Reddy Aenugu c5ac29c81d fix(frontend): add 404 fallback route for unknown paths (#6373) 2026-03-26 18:24:01 +08:00
vakrahul 13c072d731 fix: match expected error message text in mcp_client and mcp_registry 2026-03-26 15:39:17 +05:30
Aaryann Chandola 5e31975cc3 feat(mcp-cli): add CLI management commands (#6350) (#6787)
* feat(mcp-cli): add hive mcp CLI management commands (#6350)

Implement the hive mcp subcommand group with shared helpers and all
P0/P1 management commands: install, add, remove, enable, disable,
list, info, config, search, health, update.

Includes update bridge (remove+reinstall with rollback on failure),
first-use security notice, credential prompting, secret masking,
and agent usage detection via load_agent_selection().

* test(mcp-cli): add CLI integration and handler tests (#6350)

58 tests covering all commands end-to-end:
- Real framework.cli.main() entrypoint dispatch (list, install, update)
- Real registry-on-disk integration (install, list, config, info, remove)
- All 11 command handlers (install, add, remove, enable, disable, list,
  info, config, search, health, update)
- Security notice shown only once
- Credential prompting stores overrides, skips when env set, handles cancel
- Secret masking in human output, JSON output, and config display
- Index refresh semantics (stale cache fallback vs no-cache hard fail)
- Update rollback on reinstall failure preserves original entry
- Update rejects local servers and pinned servers with correct remediation
- Bulk update skips local and pinned servers
- Argparse registration validates all 11 subcommands present
- _find_agents_using_server resolves via real load_agent_selection
- _parse_key_value_pairs validates KEY=VAL format

* fix(mcp-cli): mask list --json secrets, preserve enabled state on update, defer security sentinel (#6350)

- list --json now masks override values as <set> before emitting
- update preserves enabled=False state across reinstall
- security notice sentinel only written after successful install

* refactor(mcp-cli): fix docstring, share registry instance in update, extract _mask_overrides helper (#6350)

- Fix module docstring to reflect update's full behavior
- Pass registry instance to _cmd_mcp_update_server to avoid redundant disk I/O
- Extract _mask_overrides() used by list --json, info --json, info human, and config display
- Add comment about _find_agents_using_server path arithmetic limitation
2026-03-26 18:01:28 +08:00
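The PR above lists eleven hive mcp subcommands and tests that argparse registers all of them. A registration sketch under those assumptions; the handler wiring and helper name are illustrative.

```python
MCP_SUBCOMMANDS = [
    "install", "add", "remove", "enable", "disable",
    "list", "info", "config", "search", "health", "update",
]


def register_mcp_commands(subparsers) -> None:
    mcp = subparsers.add_parser("mcp", help="Manage MCP servers")
    mcp_sub = mcp.add_subparsers(dest="mcp_command", required=True)
    for name in MCP_SUBCOMMANDS:
        cmd = mcp_sub.add_parser(name, help=f"{name} MCP servers")
        if name == "list":
            # per the PR, list --json masks credential overrides as <set> before emitting
            cmd.add_argument("--json", action="store_true", help="machine-readable output")
```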
vakrahul 82af76e72a feat: wire structured MCP errors into mcp_registry.py (closes #6352) 2026-03-26 15:30:10 +05:30
Amogh Raj a483f8d06a docs: add Windows quickstart.ps1 command in Quick Start section (#6781)
* docs: add Windows quickstart.ps1 command in Quick Start section

* fix: restore closing code fence and comment out Windows command

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-26 17:27:31 +08:00
saurabhiiitm062 859db7f056 fix: address review comments for cloudflare tool
- removed unreachable code
- updated firewall rule handling (rulesets API)
- added validation + error handling
- added missing test coverage
- fixed misleading documentation
2026-03-26 13:51:02 +05:30
saurabhiiitm062 6e0b5c7250 merge upstream main into cloudflare branch 2026-03-26 10:50:59 +05:30
Sundaram Kumar Jha e188c26e9f chore: revert changes 2026-03-26 08:11:39 +05:30
Timothy 27a2d64a98 chore: lint 2026-03-25 16:21:42 -07:00
Timothy c2dce3a8c2 fix: allow queen to read custom skills 2026-03-25 14:47:25 -07:00
Hundao b52974adcc fix(graph): remove deprecated ast.Index visitor in safe_eval.py (#6796)
Python 3.9+ no longer wraps subscript slices in ast.Index, and
Python 3.12 removed ast.Index entirely. The project requires
Python >=3.11, so this is dead code.
2026-03-25 17:55:23 +08:00
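As the commit notes, on Python 3.9+ a subscript's slice is the expression node itself rather than an ast.Index wrapper, so a safe evaluator only needs to recurse into node.slice directly. A minimal sketch; the surrounding evaluator callback is hypothetical.

```python
import ast


def _eval_subscript(node: ast.Subscript, eval_node) -> object:
    # Python 3.9+: node.slice is already the expression (e.g. ast.Constant),
    # not an ast.Index wrapper, and ast.Index itself is gone in 3.12.
    container = eval_node(node.value)
    key = eval_node(node.slice)
    return container[key]
```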
Kurt 047ad812af fix: add missing __init__.py to file_system_toolkits package (#6056)
Closes #6055
2026-03-25 16:45:32 +08:00
Fernando Mano 22d9fba1fd Feature: #6351 - Agent selection, tool resolution & framework integration -- MCP Registry integration deleted local test code -- fix failing tests 2026-03-24 22:56:56 -03:00
Fernando Mano c7d0afc775 Feature: #6351 - Agent selection, tool resolution & framework integration
Made-with: Cursor
2026-03-24 22:34:52 -03:00
Richard Tang 645792fb1a docs: remove outdated documents 2026-03-24 18:23:38 -07:00
Richard Tang 3154e34c7a docs: add instruction for running dummy agents and remove old documentation 2026-03-24 18:20:27 -07:00
Fernando Mano 45aafbc52b Merge branch 'main' into feat/agent-selection-tool-resolution-n-framework-integration 2026-03-24 17:02:08 -03:00
Levin 567340c05d Merge branch 'aden-hive:main' into skills/cli-commands 2026-03-24 22:58:03 +05:30
Timothy @aden 8ecb728148 Merge pull request #6784 from aden-hive/fix/pin-litellm-1.81.7
security: pin litellm==1.81.7 to block supply chain attack
2026-03-24 09:53:40 -07:00
Timothy 4a2141bce9 chore: regenerate uv.lock with litellm==1.81.7 pin
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 09:48:47 -07:00
Timothy 3b4d6e4602 security: pin litellm==1.81.7 to block supply chain attack
litellm>=1.82.7 contains a malicious .pth file that auto-executes at
Python startup and exfiltrates env vars, SSH keys, cloud credentials,
and CI/CD secrets to an attacker-controlled domain.

Pin to last known-safe version (currently installed). Unpin once a
verified-clean upstream release is available.

Closes #6783

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 09:42:42 -07:00
levxn 8d8656193d bug fix 2026-03-24 22:09:54 +05:30
levxn ef317371ce hive skill test implemented, --json flag for machine parsable outputs, fixed lints 2026-03-24 21:51:14 +05:30
Levin d5596ccb0a Merge branch 'aden-hive:main' into skills/cli-commands 2026-03-24 21:47:35 +05:30
Timothy @aden 89ccc664bd Merge pull request #6574 from Antiarin/feat/mcp-registry-core
feat(mcp-registry): add MCPRegistry core module (#6349)
2026-03-24 07:40:35 -07:00
Bryan @ Aden 4872c01886 Merge pull request #6777 from sundaram2021/fix/missing-antigravity-option-in-windows-powershell
fix: missing antigravity and minimax plan option in powershell
2026-03-24 07:35:39 -07:00
levxn 5f1530ec5b minor bug fix, and lint issue fixes 2026-03-24 15:52:10 +05:30
Levin 8af32b421c Merge branch 'aden-hive:main' into skills/cli-commands 2026-03-24 13:53:40 +05:30
Sundaram Kumar Jha 4620380341 fix: missing antigravity and minimax plan option in powershell 2026-03-24 09:26:07 +05:30
Richard Tang fca2deb980 chore: update documentation 2026-03-23 20:35:26 -07:00
RichardTang-Aden d7ce923ca6 Merge pull request #1586 from rhythmtaneja/improve-eventbus-logging
Improve EventBus handler error logging to include traceback
2026-03-23 20:17:51 -07:00
Richard Tang 403b47db61 chore: lint 2026-03-23 20:05:29 -07:00
Richard Tang 0d0e78579f chore: lint 2026-03-23 18:09:15 -07:00
RichardTang-Aden 447bfdfab8 Merge pull request #6763 from Leayxz/micro-fix/files_names_conflicts
micro-fix: make test filenames unique to avoid pytest import conflicts / error test_structure
2026-03-23 17:35:16 -07:00
RichardTang-Aden c77d21e393 Merge pull request #6761 from Leayxz/micro-fix/remove_obsolete_PushoverClient_tests
micro-fix: remove obsolete _PushoverClient tests
2026-03-23 17:34:49 -07:00
RichardTang-Aden 6ded508b4d Merge pull request #6774 from Leayxz/micro-fix/rename_schema_discovery
micro-fix: rename schema discovery to avoid pytest collection
2026-03-23 17:34:07 -07:00
RichardTang-Aden 75f8bf5696 Merge pull request #6743 from sundaram2021/fix/codex-oauth-stdin-select-windows
fix: windows Codex OAuth browser launch and manual fallback
2026-03-23 16:52:56 -07:00
Leandro Rodrigues 62fc02220b micro-fix: rename schema discovery to avoid pytest collection
- The file `tools/test_schema_discovery.py` was being incorrectly collected by pytest as a test module
- Since the file is actually a standalone script, this caused import errors during test collection
- Rename the file to remove the `test_` prefix so pytest no longer treats it as a test file
- Pytest test discovery no longer includes the script, eliminating the import error and restoring a clean test run
2026-03-23 20:51:18 -03:00
Richard Tang 5d4f279646 test: add real integration test for MCPRegistry → AgentRunner path 2026-03-23 15:44:54 -07:00
Bryan @ Aden 920a840756 Merge pull request #6772 from sundaram2021/fix/setup-worker-model-on-windows
fix(windows): use shared uv discovery in setup_worker_model.ps1
2026-03-23 15:44:48 -07:00
Sundaram Kumar Jha 8680a35c39 fix(powershell): use shared uv discovery in setup_worker_model 2026-03-24 03:57:07 +05:30
levxn 95cc8a4513 cli commands, v1 2026-03-24 02:23:20 +05:30
Sundaram Kumar Jha d648f3d315 refactor(event-loop): slim event loop node orchestration 2026-03-24 01:00:08 +05:30
Sundaram Kumar Jha b43044cf4d refactor(event-loop): untangle modular event loop imports 2026-03-24 00:59:55 +05:30
Sundaram Kumar Jha 4724320946 refactor(event-loop): add shared event loop types 2026-03-24 00:59:35 +05:30
Leandro Rodrigues c9134cfd91 micro-fix: make test filenames unique to avoid pytest import conflicts
- multiple test files shared the same module name "test_structure.py"
- this caused pytest import mismatches during collection
- renamed test files to "test_email_reply_agent" and "test_meeting_scheduler"
- eliminated module name collisions and fixed test discovery
2026-03-23 16:13:31 -03:00
Leandro Rodrigues 55ce751385 micro-fix: remove obsolete _PushoverClient tests
- the test suite still referenced _PushoverClient, which no longer exists
- this caused import errors and failing pytest runs
- removed all tests related to _PushoverClient
- fixed pytest execution errors
- removed dead test code
- ensured test coverage reflects the current implementation
2026-03-23 15:54:50 -03:00
Timothy @aden aca2dfb536 Merge pull request #5892 from nikhilvarmakandula/feat/openmeteo-weather-tool
feat(tools): add Open-Meteo weather tool — free real-time weather, no API key required
2026-03-23 10:30:59 -07:00
Waqas Ahmed 89ab2e0a74 feat(tools): add Mattermost messaging platform integration
Add Mattermost as a new messaging tool following the existing Discord/Telegram
pattern. Supports self-hosted and cloud instances via personal access tokens.

Tools: list_teams, list_channels, get_channel, send_message, get_posts,
create_reaction, delete_post. Includes rate limit retry logic, credential
store + env var fallback, and comprehensive tests (41 unit + 50 conformance).

Closes #6746

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-23 14:07:30 +02:00
Antiarin d11f539209 Merge branch 'main' into feat/mcp-registry-core 2026-03-23 11:29:47 +05:30
Antiarin 64a223353a fix: harden MCPConnectionManager with timeouts, SSE health checks, and failure handling
Add 30s transition timeouts to prevent deadlocks on stuck connections.
Split SSE from HTTP in health_check: SSE uses client.list_tools() instead
of hitting /health (SSE servers use event-stream protocol, not REST).
Add has_connection() for MCPRegistry health check integration. Handle
disconnect failures in release, reconnect, and cleanup_all. Guard
reconnect against refcount dropping to zero mid-reconnect.
2026-03-23 11:13:27 +05:30
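A sketch of the split health check described above, assuming an async MCP client object exposing list_tools() and an HTTP /health probe. The 30-second guard mirrors the transition timeout mentioned in the commit; the function shape and httpx usage are illustrative.

```python
import asyncio

import httpx


async def health_check(transport: str, client, base_url: str | None = None) -> bool:
    try:
        if transport == "sse":
            # SSE servers speak the event-stream protocol, not REST, so probe
            # them through the MCP client itself instead of GET /health.
            await asyncio.wait_for(client.list_tools(), timeout=30)
            return True
        async with httpx.AsyncClient(timeout=30) as http:
            resp = await http.get(f"{base_url}/health")
            return resp.status_code == 200
    except Exception:
        return False
```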
Antiarin 2d154c2db6 test: add tests for MCPRegistry, runner integration, and load_registry_servers
Covers install/add_local/remove/enable/disable, resolve_for_agent selection
precedence, health checks with pooled connections, cache fallback (defect 1),
SSE health check (defect 2), tomllib version parsing (defect 3), JSON type
validation for mcp_registry.json fields, malformed JSON error handling,
structured log emission, and retry-on-zero-tools behavior.
2026-03-23 11:13:27 +05:30
Antiarin a00c934d9d feat: add MCPRegistry core module with framework integration
Local state management for installed MCP servers in ~/.hive/mcp_registry/.
Supports install from registry index, add_local for running servers,
resolve_for_agent with include/tags/exclude/profile/max_tools/versions
selection, health checks via MCPConnectionManager, and JSON type
validation at the mcp_registry.json boundary.

Integration points: AgentRunner, queen orchestrator, credential tester
all load mcp_registry.json with error handling. ToolRegistry gains
load_registry_servers() with retry and structured DX-4 logging.
2026-03-23 11:13:27 +05:30
Sundaram Kumar Jha 18bee9cb90 Add Codex OAuth Windows regression tests 2026-03-23 10:40:51 +05:30
Sundaram Kumar Jha c1664e47e5 Fix Windows Codex OAuth URL and stdin handling 2026-03-23 10:40:30 +05:30
Emmanuel Nwanguma 2cb972fc5a fix(runner): replace print() with logger.warning() for credential warnings (#6577)
Fixes #6484

- Replace 8 raw print() calls with logger.warning() in runner.py
- Uses lazy % formatting instead of f-strings
- Warnings about missing tokens/API keys now go through logging framework
- Visible in log files when agents run headlessly
2026-03-22 18:24:42 +08:00
Emmanuel Nwanguma 0bd841ce01 fix(credentials): replace bare except Exception clauses with specific handlers (#6592)
Fixes #6481

- credential_tester/agent.py: 4 bare excepts replaced
- credentials/setup.py: 6 bare excepts replaced
- queen_memory.py: 2 bare excepts replaced (2 already had proper logging)
- Expected errors (ImportError, OSError, KeyError) logged at DEBUG
- Unexpected errors logged at WARNING with exc_info=True
- Same two-tier pattern as PR #6153 (key_storage.py)
2026-03-22 18:16:14 +08:00
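The two-tier replacement for bare except clauses described above might look like this sketch; the module, logger, and helper names are illustrative.

```python
import logging

logger = logging.getLogger(__name__)


def load_cached_credentials(path):
    try:
        return _read_credentials(path)  # hypothetical helper
    except (ImportError, OSError, KeyError) as exc:
        # Expected, recoverable conditions: note them at DEBUG and move on.
        logger.debug("Skipping cached credentials at %s: %s", path, exc)
    except Exception:
        # Anything else is unexpected: keep the traceback for diagnosis.
        logger.warning("Unexpected error loading credentials from %s", path, exc_info=True)
    return None
```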
Samer Attrah 88ec4b7e64 fix: improve tool_registry error handling with stack traces and context (#6518)
* fix: improve tool_registry error handling with stack traces and context

When tool execution fails, errors now include:
- Stack traces for debugging
- Tool name, tool_use_id, and inputs in error logs
- Same behavior for both sync and async tools

Fixes #2447

* fix: use exc_info=True and truncate inputs in tool error logs

- Replace traceback.format_exc() with exc_info=True (codebase convention)
- Truncate tool inputs to 500 chars to prevent log flooding
- Add test for input truncation
2026-03-22 18:01:28 +08:00
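A sketch of the error-context logging the PR above describes: tool name, tool_use_id, and inputs truncated to 500 characters, with the stack trace attached via exc_info. Function and constant names are assumptions.

```python
import logging

logger = logging.getLogger(__name__)

MAX_LOGGED_INPUT = 500


def log_tool_failure(tool_name: str, tool_use_id: str, inputs: dict, exc: Exception) -> None:
    rendered = repr(inputs)
    if len(rendered) > MAX_LOGGED_INPUT:
        # Truncate inputs so one oversized payload cannot flood the logs.
        rendered = rendered[:MAX_LOGGED_INPUT] + "...(truncated)"
    logger.error(
        "Tool %s (tool_use_id=%s) failed with inputs=%s",
        tool_name, tool_use_id, rendered, exc_info=exc,
    )
```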
Sundaram Kumar Jha 27d5061d97 micro-fix: quickstart dashboard auto-launch for PowerShell (#6655)
* Fix quickstart dashboard auto-launch on Windows

* chore: refresh locks

* fix: gate quickstart hive shim to Git Bash

* chore: revert unrelated frontend lockfile churn
2026-03-22 16:21:02 +08:00
Sundaram Kumar Jha ee4682c565 chore: pull latest changes ; fix: merge conflict 2026-03-22 08:52:17 +05:30
Sundaram Kumar Jha a2cd96a1a7 docs: document OpenRouter and Hive LLM provider setup (#6644)
* docs(llm): document OpenRouter and Hive LLM setup

* docs(contributing): add OpenRouter and Hive LLM guidance
2026-03-22 10:12:44 +08:00
Hundao 07b82a51f6 fix(examples): use __file__ relative path for mcp_servers.json copy (#6677)
Fixes #1669
2026-03-22 08:26:13 +08:00
Timothy @aden 3e1282b31e Merge pull request #6682 from aden-hive/feat/image-capabilities
Release / Create Release (push) Waiting to run
feat: image capabilities — upload, screenshot passthrough, vision detection & fallback, aria refs
2026-03-20 21:25:37 -07:00
Timothy 736756b257 chore: fix test 2026-03-20 21:22:29 -07:00
Timothy 90efe7009d chore: lint 2026-03-20 21:13:22 -07:00
Timothy 4adb369bde chore: lint 2026-03-20 21:12:03 -07:00
Timothy d4a30eb2f3 feat: image model fallback 2026-03-20 20:18:07 -07:00
Timothy 94bb4a2984 Merge branch 'main' into feat/image-capabilities 2026-03-20 18:42:55 -07:00
Timothy 648bad26ed feat: user input image content 2026-03-20 18:40:28 -07:00
RichardTang-Aden f0c7470f3d Merge pull request #6663 from sundaram2021/fix/missing-minimax-option-on-windows
fix: minimax option in powershell quickstart
2026-03-20 17:00:11 -07:00
RichardTang-Aden fe533b72a6 Merge pull request #6648 from levxn/main
Antigravity subscription support as an LLM provider
2026-03-20 16:52:38 -07:00
Richard Tang e581767cab chore: ruff lint 2026-03-20 16:50:50 -07:00
Richard Tang 0663ee5950 feat: validate the existing credentials before auth 2026-03-20 16:45:56 -07:00
Richard Tang 4b97baa34b feat: native google oauth for antigravity support 2026-03-20 16:40:15 -07:00
levxn a89296d397 lint fix 2026-03-21 02:35:09 +05:30
Levin d568912ba2 Merge branch 'aden-hive:main' into main 2026-03-21 01:32:13 +05:30
Levin c4d7980058 Merge pull request #1 from levxn/subscription/antigravity
Subscription/antigravity
2026-03-21 01:30:27 +05:30
Timothy @aden 8549fe8238 Merge pull request #6635 from vakrahul/fix/skill-structured-errors-6366
feat: structured skill error codes and diagnostics (closes #6366)
2026-03-20 12:45:35 -07:00
levxn 2b8d85bb95 fixing tool calling issue: antigravity models expect thought_signature in functionCall parts, else requests fail with a 400 error stating invalid arguments 2026-03-20 23:26:50 +05:30
levxn 07f7801166 test v1 2026-03-20 22:32:30 +05:30
Levin 1f12a45151 Merge branch 'aden-hive:main' into main 2026-03-20 22:01:22 +05:30
SRI LIKHITA ADRU 1e2e6e03dd Merge branch 'main' into fix/stream-transient-retry-cap 2026-03-20 09:08:14 -04:00
Arshad Uzzama Shaik 936e02e8e6 fix(security): prevent symlink-based sandbox escape in get_secure_path (closes #1167) (#5635)
* fix(security): prevent symlink-based sandbox escape in get_secure_path (closes #1167)

* style: apply ruff formatting to tools to satisfy CI

---------

Co-authored-by: Arshad Shaik <arshad.shaik@violetis.ai>
2026-03-20 19:16:47 +08:00
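A minimal sketch of the symlink-resolution guard the fix above implies: resolve the candidate path (following symlinks) before checking containment in the sandbox root. Only the function name comes from the commit title; the signature and error handling are assumptions.

```python
from pathlib import Path


def get_secure_path(sandbox_root: str, relative_path: str) -> Path:
    root = Path(sandbox_root).resolve()
    # resolve() follows symlinks, so a link pointing outside the sandbox
    # cannot pass the containment check below.
    candidate = (root / relative_path).resolve()
    if not candidate.is_relative_to(root):
        raise PermissionError(f"path escapes sandbox: {relative_path!r}")
    return candidate
```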
Hundao d59fe1e109 fix(graph): remove dead check_constraint placeholder (#6660)
Never called anywhere in the codebase. Constraints are enforced
via prompt context, not runtime validation.
2026-03-20 18:44:18 +08:00
Sundaram Kumar Jha 274318d3e5 fix: minimax option in powershell quickstart 2026-03-20 15:33:26 +05:30
Anurag Kumar 0f0884c2e0 fix(tools): handle non-HTML content and add PDF URL support (#438)
* feat(tools): add URL support to pdf_read tool

Enable pdf_read to accept both local file paths and HTTP/HTTPS URLs.
Downloads PDF content to temporary file when URL is provided, validates
content-type, and cleans up automatically after extraction.

- Detect URL inputs (http:// or https://)
- Download PDF with httpx (60s timeout)
- Validate Content-Type is application/pdf
- Use temporary file for URL-based PDFs
- Automatic cleanup in finally block
- Maintains backward compatibility with local paths

Completes the workflow: web_scrape error on PDF → pdf_read from URL

* test(tools): Add test coverage for new features in web_scrape and pdf_read tools

* style: fix lint issues in pdf_read URL support

---------

Co-authored-by: Anurag <anuragkr-codes@users.noreply.github.com>
Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-20 16:36:25 +08:00
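The download, validate, and cleanup flow the pdf_read change describes, sketched with httpx. The 60-second timeout and the application/pdf content-type check come from the commit message; extract_pdf_text is a hypothetical extractor standing in for the tool's real one.

```python
import tempfile
from pathlib import Path

import httpx


def read_pdf(path_or_url: str) -> str:
    if not path_or_url.startswith(("http://", "https://")):
        return extract_pdf_text(Path(path_or_url))  # hypothetical local-path extractor

    tmp = None
    try:
        resp = httpx.get(path_or_url, timeout=60, follow_redirects=True)
        resp.raise_for_status()
        if "application/pdf" not in resp.headers.get("content-type", ""):
            raise ValueError("URL did not return application/pdf content")
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fh:
            fh.write(resp.content)
            tmp = Path(fh.name)
        return extract_pdf_text(tmp)
    finally:
        # Automatic cleanup of the temporary download, success or failure.
        if tmp is not None:
            tmp.unlink(missing_ok=True)
```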
saurabhiiitm062 becbdb3706 feat: cloudflare DNS/Zone tool integrations 2026-03-20 11:11:10 +05:30
Sundaram Kumar Jha 9b59255770 chore: pull latest change , refactor: modularize latest change 2026-03-20 11:05:12 +05:30
Sundaram Kumar Jha 49fd443da8 chore: resolve merge conflict 2026-03-20 10:01:07 +05:30
Timothy @aden 764012c598 Merge pull request #6652 from aden-hive/feature/absolutely-parallel
Release / Create Release (push) Waiting to run
fix: parallel subagent execution display, session resume bugs, and GCU termination
2026-03-19 20:21:47 -07:00
Timothy fd4dc1a69a fix: google_sheets JSON parse error before credentials check
Move _get_client() before JSON deserialization so missing-credentials
errors aren't masked by input validation. Wrap json.loads in try/except
for non-JSON string inputs.
2026-03-19 20:13:18 -07:00
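The ordering fix in the google_sheets commit, roughly: acquire the client first so a missing-credentials error surfaces before input validation, then parse JSON string input defensively. Beyond _get_client, which the commit names, the other identifiers are assumptions.

```python
import json


def append_rows(tool, values):
    # Surface missing-credentials errors before any input validation runs.
    client = tool._get_client()

    if isinstance(values, str):
        try:
            values = json.loads(values)
        except json.JSONDecodeError as exc:
            raise ValueError(f"values must be JSON or a list of rows: {exc}") from exc

    return client.append_rows(values)  # hypothetical client method
```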
Timothy 377cd39c2a chore: lint 2026-03-19 20:07:42 -07:00
Timothy e92caeef24 fix: line too long in google_sheets_tool
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 20:06:31 -07:00
Timothy @aden b7e6226478 Update asset link in README.md 2026-03-19 19:41:19 -07:00
Timothy a995818db2 fix: subagent bubble boundary 2026-03-19 17:57:33 -07:00
Timothy 0772b4d300 feat: better subagent interleave logic 2026-03-19 16:58:34 -07:00
Timothy 684e0d8dc6 fix: no memory consolidation for worker 2026-03-19 16:58:00 -07:00
Timothy d284c5d790 feat: parallel execution display 2026-03-19 15:25:21 -07:00
Timothy 7a9b9666c4 fix: refresh system prompt with preamble 2026-03-19 15:25:04 -07:00
Timothy a852cb91bf fix: non-blocking memory consolidation 2026-03-19 15:24:30 -07:00
Timothy 2f21e9eb4b fix: session reload preamble 2026-03-19 15:24:12 -07:00
Timothy 8390ef8731 fix: google sheet tool support json string input 2026-03-19 15:23:31 -07:00
levxn 8d21479c24 fixing lint errors 2026-03-20 02:34:58 +05:30
levxn 965dec3ba1 fixing errors, finalising credential fetch (client id and secret) properly in fallback paths 2026-03-20 02:32:42 +05:30
Timothy d4b54446be Merge branch 'main' into feat/image-capabilities 2026-03-19 11:12:33 -07:00
Levin 7992b862c2 Merge branch 'aden-hive:main' into main 2026-03-19 22:16:10 +05:30
Ananya Verma 44b3e0eaa2 Configure pytest to ignore DeprecationWarning (#1727)
Add pytest configuration to ignore specific warnings.
2026-03-19 23:17:50 +08:00
levxn f480fc2b94 oauth creds for antigravity picked properly 2026-03-19 20:26:42 +05:30
Fernando Mano b599a760e8 Feature: #6351 - Agent selection, tool resolution & framework integration -- first version with mocked MCPRegistry 2026-03-19 10:48:37 -03:00
Levin b4a37cdb03 Merge branch 'aden-hive:main' into skills/ds-ovrride-heuristics 2026-03-19 18:51:29 +05:30
vakrahul 2844dbf19f feat: structured skill error codes and diagnostics (closes #6366) 2026-03-19 13:18:18 +05:30
Sundaram Kumar Jha 4885db318e fix: merge conflict 2026-03-19 09:22:44 +05:30
Sundaram Kumar Jha fa7ce53fb3 style(repo): fix ruff format violations
Apply Ruff formatting to the extracted event loop modules, the EventLoopNode wrappers, and the OpenRouter key check script so the lint CI format check passes cleanly.
2026-03-19 09:20:18 +05:30
Sundaram Kumar Jha 75a2ef2c4a Merge branch 'main' into refactor/event-loop-node-modularization 2026-03-19 09:14:10 +05:30
Sundaram Kumar Jha a0b9d6afaf chore: refresh locks 2026-03-19 09:08:10 +05:30
Sundaram Kumar Jha 74c0a85e3f refactor(graph): modularize event loop helpers
Extract EventLoopNode helper logic into focused event_loop modules while keeping the node responsible for orchestration.

Preserve the existing behavior and compatibility for compaction, event publishing, cursor persistence, synthetic tools, judge evaluation, stall detection, tool result handling, and subagent escalation wiring.
2026-03-19 09:07:19 +05:30
Timothy @aden 22b7e4b0c3 Merge pull request #6624 from aden-hive/feature/agent-skills
Release / Create Release (push) Waiting to run
feat: agent skills system and observability improvements
2026-03-18 20:28:34 -07:00
Timothy 5413833a69 fix: tool test 2026-03-18 20:20:32 -07:00
bryan 02e1a4584a fix: autolaunch gui (windows) 2026-03-18 20:15:25 -07:00
Timothy 520840b1dd fix: no immediate run digest 2026-03-18 20:14:20 -07:00
bryan ee96147336 feat: autolaunch gui (mac) 2026-03-18 20:11:03 -07:00
Timothy 705cef4dc1 fix: context window display 2026-03-18 20:05:48 -07:00
Timothy ab26e64122 Merge remote-tracking branch 'origin/main' into feature/agent-skills 2026-03-18 19:41:39 -07:00
Timothy @aden f365e219cb Merge pull request #6615 from aden-hive/feat/worker-llm
feat: support separate LLM model for worker agents
2026-03-18 19:41:06 -07:00
Timothy 01621881c2 chore: lint 2026-03-18 19:40:41 -07:00
Timothy f7639f8572 fix: realtime context display 2026-03-18 19:29:31 -07:00
Timothy fc643060ce fix: better message bubble handling 2026-03-18 17:49:55 -07:00
Timothy 9aebeb181e feat: compaction debugger 2026-03-18 17:42:10 -07:00
Timothy acbbfaaa79 feat: compaction debug 2026-03-18 17:41:22 -07:00
Timothy bf170bce10 feat: enable mcp server reuse by default 2026-03-18 17:30:31 -07:00
Timothy 0a090d058b Merge remote-tracking branch 'origin/main' into feature/agent-skills 2026-03-18 17:11:12 -07:00
Timothy @aden 47bfadaad9 Merge pull request #6622 from aden-hive/fix/resume-empty-message
Fix empty queen message bubbles on session resume
2026-03-18 16:55:50 -07:00
Timothy d968dcd44c Merge branch 'main' into feature/agent-skills 2026-03-18 16:53:42 -07:00
Timothy @aden 6fdaa9ea50 Merge pull request #6534 from VasuBansal7576/codex/mcp-connection-manager-6348-draft
feat: add shared MCP connection manager
2026-03-18 16:52:44 -07:00
Timothy @aden 4d251fbdc2 Merge pull request #6531 from VasuBansal7576/codex/mcp-transports-6347-single
feat: add unix and sse MCP transports
2026-03-18 16:38:17 -07:00
Timothy 6acceed288 feat: hive debugger 2026-03-18 16:26:55 -07:00
Richard Tang 8dd1d6e3aa chore: lint 2026-03-18 16:01:32 -07:00
Timothy 1da28644a6 Merge branch 'main' into feature/agent-skills 2026-03-18 15:38:49 -07:00
Timothy 6452fe7fef fix: discord bot 2026-03-18 15:34:08 -07:00
Richard Tang acff008bd2 fix: empty message render 2026-03-18 15:26:56 -07:00
Timothy 651d6850a1 fix: bounty tracker change 2026-03-18 14:49:21 -07:00
Timothy c7fdc92594 fix: bounty script 2026-03-18 14:27:24 -07:00
Richard Tang 43602a8801 fix: trim to remove empty message 2026-03-18 13:55:57 -07:00
Timothy @aden 3da04265a6 Merge pull request #6566 from levxn/skills/context-protection
feat(skills): AS-9 and AS-10 — skill directory allowlisting and context protection for activated skills
2026-03-18 13:51:25 -07:00
Timothy @aden 4c98f0d2d0 Merge pull request #6564 from levxn/skills/resource-loading
feat(skills): AS-6 tier 3 resource loading — base_dir in catalog XML and skill dirs wired through execution stack
2026-03-18 13:50:54 -07:00
bryan d84c3364d0 chore: update to pass make test 2026-03-18 13:20:56 -07:00
Timothy @aden ae921f6cee Merge pull request #6619 from aden-hive/fix/claude-code-subscription-support
fix(llm): restore Claude Code subscription OAuth support
2026-03-18 13:08:27 -07:00
Timothy 6b506a1c08 chore: lint 2026-03-18 13:05:00 -07:00
Timothy 0c9f4fa97e fix(llm): restore Claude Code subscription (OAuth) support after Anthropic API change
Anthropic tightened OAuth validation on 2026-03-17, requiring a
specific User-Agent header and a billing integrity system block for
subscription-authenticated requests. Without these, all OAuth calls
return HTTP 400 with a generic "Error" message.

Changes:
- Add billing integrity system block (SHA-256 hash derived from first
  user message content) prepended to system messages on OAuth requests
- Set User-Agent to claude-code/<version> for OAuth sessions
- Fix OAuth header patch to detect tokens in x-api-key (not just
  Authorization) and add required beta/browser-access headers
- Set litellm.drop_params=True to prevent unsupported params like
  stream_options from leaking to Anthropic (causes 400)
- Skip stream_options entirely for Anthropic models
- Honour LITELLM_LOG env var for debug logging instead of hardcoding
  LiteLLM logger to WARNING
2026-03-18 13:02:24 -07:00
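A heavily hedged sketch of the OAuth request shaping described above: a SHA-256 block derived from the first user message prepended to the system blocks, plus a claude-code User-Agent. The commit does not state the exact block format Anthropic expects, so the strings below are placeholders, not the real wire format.

```python
import hashlib


def shape_oauth_request(messages: list[dict], system_blocks: list[dict], version: str):
    first_user = next((m["content"] for m in messages if m["role"] == "user"), "")
    digest = hashlib.sha256(str(first_user).encode("utf-8")).hexdigest()
    # Placeholder block: the commit only says a SHA-256 hash of the first user
    # message content is prepended as a system block for OAuth (subscription) requests.
    integrity_block = {"type": "text", "text": f"billing-integrity:{digest}"}
    headers = {"User-Agent": f"claude-code/{version}"}
    return [integrity_block, *system_blocks], headers
```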
Richard Tang 95e30bc607 chore: remove old queen history endpoint 2026-03-18 12:43:30 -07:00
bryan 0f1f0090b0 chore: linter update 2026-03-18 12:41:01 -07:00
bryan c0da3bec02 feat: strip image content for non-vision models 2026-03-18 12:40:30 -07:00
bryan 9dadb5264d feat: add screenshot image passthrough to LLM 2026-03-18 12:40:18 -07:00
bryan e39e6a75cc feat: add ref system for aria snapshots 2026-03-18 12:36:51 -07:00
Richard Tang 23c66d1059 feat: worker model loading 2026-03-18 12:14:02 -07:00
Richard Tang b9d529d94e feat: support separate worker llm setup 2026-03-18 11:19:44 -07:00
Bryan @ Aden 1c9b09fb78 Merge pull request #6602 from sundaram2021/cleanup/remove-commit-message-txt
micro-fix: remove unnecessary commit message file
2026-03-18 17:40:50 +00:00
Timothy @aden 9fb14f23d2 Merge pull request #6526 from sundaram2021/feature/openrouter-api-key-support
feat openrouter api key support
2026-03-18 10:15:40 -07:00
levxn 96609386a3 lints fixed 2026-03-18 22:18:16 +05:30
levxn 0cef0e6990 DS-12, DS-13 skill config overrides and runtime heuristics 2026-03-18 22:13:09 +05:30
Sundaram Kumar Jha 4795dc4f68 chore: clean useless commit message file 2026-03-18 16:45:10 +05:30
Sundaram Kumar Jha acf0f804c5 style(llm): apply ruff formatting 2026-03-18 10:54:06 +05:30
Sundaram Kumar Jha 4e2951854b fix(openrouter): harden quickstart setup and model validation 2026-03-18 10:39:58 +05:30
Sundaram Kumar Jha 80dfb429d7 refactor(review): remove out-of-scope PR changes 2026-03-18 10:39:48 +05:30
Timothy @aden 9c0ba77e22 Replace demo image with GitHub asset link
Updated README to include new asset link and removed demo image.
2026-03-17 20:59:14 -07:00
Timothy @aden 46b4651073 Merge pull request #6589 from aden-hive/fix/data-disclosure-gaps
Release / Create Release (push) Waiting to run
Fix data disclosure gaps, add worker run digests, clean up deprecated tools
2026-03-17 20:46:12 -07:00
Timothy 86dd5246c6 Merge remote-tracking branch 'origin/fix/resume-with-scheduler' into fix/data-disclosure-gaps 2026-03-17 20:44:28 -07:00
Timothy a1227c88ee Merge remote-tracking branch 'origin/fix/resume-with-scheduler' into fix/data-disclosure-gaps 2026-03-17 20:42:25 -07:00
Timothy 535d7ab568 fix: worker digest sub event 2026-03-17 20:41:56 -07:00
Richard Tang af10494b31 chore: ruff lint 2026-03-17 20:41:08 -07:00
Richard Tang 39c1042827 fix: fall back to queen-only session when worker load fails on cold restore 2026-03-17 20:38:41 -07:00
Richard Tang 16e7dc11f4 fix: don't overwrite meta in queen creation 2026-03-17 20:27:39 -07:00
Richard Tang 7a27babefd feat: track and resume the session by phase 2026-03-17 20:22:54 -07:00
Timothy d53ae9d51d fix: deprecated tests 2026-03-17 20:20:21 -07:00
Timothy 910cf7727d Merge remote-tracking branch 'origin/fix/resume-with-scheduler' into fix/data-disclosure-gaps 2026-03-17 20:14:25 -07:00
Timothy 1698605f15 chore: lint 2026-03-17 19:59:23 -07:00
Timothy eda124a123 chore: lint 2026-03-17 19:58:08 -07:00
Timothy 15e9ce8d2f Merge remote-tracking branch 'origin/feature/session-digest' into fix/data-disclosure-gaps 2026-03-17 19:45:07 -07:00
Timothy c01dd603d7 fix: digest invocation 2026-03-17 19:44:22 -07:00
Timothy 9d5157d69f feat: queen subscribe to worker digest 2026-03-17 19:23:43 -07:00
Timothy d78795bdf5 Merge remote-tracking branch 'origin/feature/session-digest' into fix/data-disclosure-gaps 2026-03-17 19:15:22 -07:00
Timothy ff2b7f473e fix: subagent execution 2026-03-17 19:15:07 -07:00
Timothy 73c9a91811 feat: add worker memory consolidation hooks 2026-03-17 19:14:07 -07:00
Timothy 27b765d902 Merge branch 'feature/session-digest' into fix/data-disclosure-gaps 2026-03-17 18:32:20 -07:00
Timothy fddba419be fix: minor issues 2026-03-17 18:30:57 -07:00
Timothy f42d6308e8 Merge branch 'main' into fix/data-disclosure-gaps 2026-03-17 17:50:36 -07:00
Timothy c167002754 fix: data disclosure gaps 2026-03-17 17:50:08 -07:00
Timothy @aden ea26ee7d0c Merge pull request #6568 from aden-hive/feature/node-focus-prompt
Inject execution-scope preamble into worker node system prompts
2026-03-17 17:38:49 -07:00
Richard Tang 5280e908b2 feat: change the agent last active time 2026-03-17 17:35:01 -07:00
RichardTang-Aden 1c5dd8c664 Merge pull request #5178 from Schlaflied/feat/sdr-agent-template
feat(templates): add SDR Agent sample template
2026-03-17 16:05:45 -07:00
Richard Tang 3aca153be5 fix: add missing flowchart and terminal nodes 2026-03-17 16:03:29 -07:00
Timothy 65c8e1653c chore: lint 2026-03-17 15:31:36 -07:00
Timothy 58e4fa918c feat: make worker node aware of boundaries 2026-03-17 15:28:41 -07:00
Timothy 3af13d3f90 feat: session digest for run scoped diary 2026-03-17 14:25:32 -07:00
levxn b799789dbe fixing lint 2026-03-18 02:15:58 +05:30
levxn 2cd73dfccc implements AS-9 and AS-10 2026-03-18 02:06:51 +05:30
levxn 57d77d5479 fixing lint 2026-03-18 01:32:24 +05:30
levxn 5814021773 skills trust gate merged properly into resource loading branch 2026-03-18 01:18:20 +05:30
levxn 4f4cc9c8ce halfway done commit 2026-03-18 00:59:35 +05:30
Timothy d9c840eee5 chore: resolve merge conflicts with feature/agent-skills
Integrate SkillsManager refactor from base branch. Trust gating (AS-13)
is now wired into SkillsManager._do_load() instead of inline in runner.py,
with the interactive flag passed through SkillsManagerConfig.
2026-03-17 11:55:11 -07:00
Timothy @aden d2eb86e534 Merge pull request #6540 from sundaram2021/fix/make-windows-compatibility
fix make test compatibility on windows
2026-03-17 11:41:32 -07:00
Timothy 03842353e4 Merge branch 'main' into feature/openrouter-api-key-support 2026-03-17 11:21:53 -07:00
Schlaflied 48747e20af fix: remove personal oauth credential entries from .gitignore
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-17 13:53:16 -04:00
Schlaflied 58af593af6 revert: remove unrelated changes from previous commit
Restore .claude/settings.json and revert .gitignore change
that were accidentally included in the sdr-agent refactor commit.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-17 13:52:44 -04:00
Schlaflied 450575a927 refactor(sdr-agent): reuse agent.start() in tui command and fix mock mode
- Replace duplicated setup code in tui command with agent.start(mock_mode=mock)
- Fix mock mode to use MockLLMProvider instead of llm=None
- Add demo_contacts.json sample data for template testing
- Untrack .claude/settings.json and add to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-17 13:52:10 -04:00
Schlaflied eac2bb19b2 fix(sdr-agent): fix agent runtime lifecycle and mcp config
- Replace self._executor with self._agent_runtime (AgentRuntime | None)
- Import AgentRuntime for proper type annotation
- Add missing await self._agent_runtime.start() in start() — runtime
  was created but never started, causing silent failures at runtime
- Add self._agent_runtime = None reset in stop() for clean restart
- Remove redundant self._graph is None guard in trigger_and_wait()
- Update mcp_servers.json with hive-tools server config
- Add credential file patterns to .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-17 13:50:29 -04:00
Schlaflied 756a815bf0 feat(templates): add SDR Agent sample template 2026-03-17 13:50:05 -04:00
mma2027 23a7b080eb test: add comprehensive test suite for safe_eval (#4015)
* test: add comprehensive test suite for safe_eval sandboxed evaluator

Adds 113 tests across 14 test classes covering the full surface area of
the safe_eval expression evaluator used by edge conditions:

- Literals, data structures, arithmetic, unary/binary/boolean operators
- Short-circuit semantics for `and`/`or` (including guard patterns)
- Ternary expressions, variable lookup, subscript/attribute access
- Whitelisted function and method calls
- Security boundaries (private attrs, disallowed AST nodes, blocked builtins)
- Real-world EdgeSpec.condition_expr patterns from graph executor usage

* style: fix import sort order

---------

Co-authored-by: mma2027 <mma2027@users.noreply.github.com>
Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-18 01:01:31 +08:00
mma2027 bf39bcdec9 fixed race condition deadlock, missing short-circuit eval, unhandled format exceptions (#4012) 2026-03-18 00:36:54 +08:00
Richard Tang 0276632491 Merge branch 'feat/graph-improvements' 2026-03-17 07:34:10 -07:00
RichardTang-Aden ae2993d0d1 Merge pull request #6528 from Antiarin/feat/trigger-nodes-in-draft-graph
Restore trigger nodes in the new flowchart
2026-03-16 20:54:36 -07:00
RichardTang-Aden d14d71f760 Merge pull request #6549 from aden-hive/staging
Release / Create Release (push) Waiting to run
release 0.7.2
2026-03-16 20:44:47 -07:00
Richard Tang ef6efc2f55 chore: lint and dead code 2026-03-16 20:44:03 -07:00
Antiarin 738641d35f fix: correct trigger target, label, and SSE event data
- Add name and entry_node to all trigger SSE events (TRIGGER_AVAILABLE,
  TRIGGER_ACTIVATED, TRIGGER_DEACTIVATED) so frontend gets correct data
  immediately instead of guessing
- Use ep.entry_node from backend in polling instead of guessing first
  non-trigger node
- Compute cronToLabel from trigger config during polling so pill labels
  show human-readable schedule
- Fix AsyncMock for event_bus.publish in tests
2026-03-17 09:07:10 +05:30
Antiarin 22f5534f08 fix: ensure Queen calls remove_trigger when user asks to remove scheduler
Added explicit prompt guidance requiring the Queen to call the
remove_trigger tool instead of just saying "it's removed."
2026-03-17 09:07:10 +05:30
Antiarin b79e7eca73 feat: live update trigger pill and detail panel on save
- Handle trigger_updated SSE event to update graph node label and
  config in real time when cron or task is saved
- Use cronToLabel for human-readable schedule display in detail panel
- Add "Saved" button feedback for Save Cron and Save Task (2s toast)
- Update trigger pill label to reflect new schedule on cron save
2026-03-17 09:07:10 +05:30
Antiarin 28250dc45e feat: support cron editing via trigger update API
- Extend PATCH /triggers/{id} to accept trigger_config with cron
  validation via croniter and active timer restart
- Add TRIGGER_UPDATED SSE event so frontend updates in real time
- Update frontend API client to use updateTrigger with config support
- Add tests for task update, cron restart, and invalid cron rejection
2026-03-17 09:07:10 +05:30
Antiarin fe5df6a87a feat: restore trigger node rendering in DraftGraph
Trigger nodes (scheduler, webhook, etc.) stopped appearing after the
v0.7.0 refactor because DraftGraph had no trigger awareness.

- Extract shared utilities (cssVar, truncateLabel, trigger colors/icons,
  useTriggerColors, cronToLabel) into lib/graphUtils.ts
- Render trigger pills above the draft flowchart with pill shape, icons,
  countdown timers, active/inactive status, and click handling
- Draw dashed edges from trigger pills to the correct draft node using
  flowchartMap lookup
- Name all trigger layout constants, fix countdown text color bug
- Include trigger pill extent in SVG viewBox width

Closes #6344
2026-03-17 09:07:10 +05:30
Richard Tang 07e4b593dd fix: write config when change model with existing key 2026-03-16 20:23:20 -07:00
Timothy 497591bf3b Merge remote-tracking branch 'origin/feat/hive-llm-support' into staging 2026-03-16 19:49:21 -07:00
Timothy a2a3e334d6 Merge branch 'feature/node-node-comm-by-file' into staging 2026-03-16 19:48:45 -07:00
Timothy 1ccbfaf800 Merge branch 'feature/agent-skills' into staging 2026-03-16 19:48:36 -07:00
Timothy a9afa0555c chore: lint 2026-03-16 19:43:19 -07:00
Timothy 83b2183cf0 Merge branch 'feature/agent-skills' into feature/node-node-comm-by-file 2026-03-16 19:37:46 -07:00
bryan c2dea88398 refactor: active node always displaying 2026-03-16 19:30:44 -07:00
Timothy f49e7a760e fix: skill memory keys breaking unrestricted node permissions
Only extend read_keys/write_keys with skill memory keys when the
list was already non-empty (restricted). An empty list means "allow
all" — adding _-prefixed skill keys to an empty list accidentally
activated the permission check and blocked legitimate reads.
2026-03-16 19:27:48 -07:00
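The permission fix above hinges on "empty list means allow all"; a sketch of the guarded extension, with field names taken from the commit and the surrounding node config assumed.

```python
def extend_with_skill_keys(read_keys: list[str], skill_keys: list[str]) -> list[str]:
    # An empty key list means "allow all"; appending skill keys to it would
    # silently switch the node into restricted mode and block legitimate reads.
    if not read_keys:
        return read_keys
    return read_keys + [k for k in skill_keys if k not in read_keys]
```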
bryan dc95c88da0 chore: linter update 2026-03-16 19:22:51 -07:00
Timothy 6e0255ebec fix: lint E501 line-too-long and auto-format 2026-03-16 19:21:27 -07:00
bryan b51e688d1a feat: transition when loading 2026-03-16 19:17:16 -07:00
Timothy 379d3df46b feat: file path first data passing 2026-03-16 19:14:45 -07:00
bryan b77a3031fe refactor: update flowchart.json for templates 2026-03-16 17:27:28 -07:00
bryan c10eea04ec refactor: update graph node colors 2026-03-16 17:26:57 -07:00
Richard Tang 491a3f24da chore: Suppress noisy LiteLLM INFO logs 2026-03-16 16:45:23 -07:00
Timothy c7d70e0fb1 fix: skill injection, tool call timeout 2026-03-16 16:26:16 -07:00
Richard Tang d59f8e99cb chore: prompt users to go to discord for hive key 2026-03-16 16:09:47 -07:00
Richard Tang 0a91b49417 feat: add validation and config for baseURL 2026-03-16 16:07:13 -07:00
Timothy ced64541b9 Merge remote-tracking branch 'origin/main' into feature/agent-skills 2026-03-16 15:45:00 -07:00
levxn 88253883a3 tier 3 resource loading 2026-03-17 03:30:58 +05:30
Timothy 3c30cfe02b Merge branch 'chore/fix-workspace-queen-message' into feature/agent-skills 2026-03-16 14:52:03 -07:00
Timothy 0d6267bcf1 fix: add delegation notice 2026-03-16 14:49:33 -07:00
Richard Tang b47175d1df feat: add hive llm spec in the quickstart 2026-03-16 14:10:30 -07:00
Timothy 6f23a30eed fix: skill lifecycle to runtime 2026-03-16 13:46:49 -07:00
Sundaram Kumar Jha ff7b5c7e27 fix: prepend ~/.local/bin to PATH so uv is found in Git Bash on Windows 2026-03-17 01:28:25 +05:30
bryan 69f0ff7ac9 chore: linter update 2026-03-16 12:22:29 -07:00
bryan c3f13c50eb docs: remove stale iso 5807 references 2026-03-16 12:22:01 -07:00
bryan 5477408d40 chore: code quality updates 2026-03-16 12:18:46 -07:00
bryan 9fad385ddf fix: return staging phase for disk-loaded agents to prevent false planning loader 2026-03-16 12:14:20 -07:00
bryan cf44ee1d9b refactor: remove AgentGraph, extract shared types, add resizable graph panel 2026-03-16 12:13:56 -07:00
bryan 4ab33a39d6 chore: add generated flowchart.json for template agents 2026-03-16 12:13:29 -07:00
bryan ae19121802 test: add tests for flowchart_utils classification and remap 2026-03-16 12:13:16 -07:00
bryan b518525418 docs: update flowchart schema for 9 types with new color palette 2026-03-16 12:13:06 -07:00
bryan ac3fe38b33 refactor: remove dead shape cases and update imports 2026-03-16 12:12:50 -07:00
bryan 3c6a30fcae refactor: trim queen prompt to 9 flowchart types with dark theme colors 2026-03-16 12:12:35 -07:00
bryan 2ced873fb5 refactor: extract flowchart utils into dedicated module with fallback generation 2026-03-16 12:12:17 -07:00
levxn 6ed6e5b286 lint fixes 2026-03-17 00:32:14 +05:30
Vasu Bansal 30bb0ad5d8 style: format MCP connection manager 2026-03-16 23:46:44 +05:30
Vasu Bansal cb0845f5ba fix: wrap MCP manager cleanup condition 2026-03-16 23:41:36 +05:30
Levin ce2525b59c Merge branch 'aden-hive:main' into skills/trust-gating 2026-03-16 23:39:27 +05:30
levxn 1f77ec3831 fixed bug introduced with change in executor.py, AS-13 along with upstream's AS-1,2,3,4,5 2026-03-16 23:38:45 +05:30
Timothy @aden ab995d8b96 Merge pull request #6530 from aden-hive/chore/fix-workspace-queen-message
fix(micro-fix): queen message display
2026-03-16 10:52:57 -07:00
Vasu Bansal 6ab5aa8004 style: format mcp client
Apply ruff formatting to satisfy CI on the MCP transport changes.
2026-03-16 23:19:49 +05:30
Vasu Bansal 4449cd8ee8 feat: add shared MCP connection manager 2026-03-16 23:10:26 +05:30
Vasu Bansal 8b60c03a0a feat: add unix and sse MCP transports
Implements unix socket and SSE MCP transports, adds reconnect-once retry for unix/SSE, and adds focused unit coverage.
2026-03-16 23:03:44 +05:30
Timothy c2e560fc07 fix: queen message display 2026-03-16 10:30:05 -07:00
vakrahul 2f15a16159 feat: structured MCP error codes and failure diagnostics (closes #6352) 2026-03-16 22:50:14 +05:30
Timothy 19f7ae862e fix: skill loading log 2026-03-16 10:14:33 -07:00
Timothy 5e9f74744a fix: google sheet tools account param 2026-03-16 10:14:05 -07:00
Levin 0e98023e40 Merge branch 'aden-hive:main' into skills/trust-gating 2026-03-16 22:23:57 +05:30
Timothy 7787179a5a Merge branch 'main' into feature/agent-skills 2026-03-16 09:14:29 -07:00
Timothy @aden b63205b91a Merge pull request #6010 from Antiarin/feat/notion-tool-docs-and-improvements
feat: add Notion tool README, improve tool logic, and expand test coverage
2026-03-16 08:36:11 -07:00
Timothy @aden 347bccb9ee Merge branch 'main' into feat/notion-tool-docs-and-improvements 2026-03-16 08:10:43 -07:00
Sundaram Kumar Jha 22bb07f00e chore: resolve merge conflict 2026-03-16 19:59:57 +05:30
Sundaram Kumar Jha 660f883197 style(core): apply ruff formatting to satisfy CI lint 2026-03-16 19:57:21 +05:30
Timothy @aden 9d83f0298f Merge pull request #6385 from Waryjustice/fix/google-sheets-credentials-orphan
fix: make state.json progress writes atomic in GraphExecutor
2026-03-16 07:25:13 -07:00
Sundaram Kumar Jha 988de80b66 Merge branch 'main' into feature/openrouter-api-key-support 2026-03-16 19:51:04 +05:30
Sundaram Kumar Jha dc6aa226ee feat(openrouter): validate model readiness and harden tool-call handling
- add OpenRouter chat completion validation to key checks for quickstart flows

- improve OpenRouter compat parsing to convert plain textual tool calls into real tool events

- prevent tool-call text from leaking into assistant responses

- add regression tests for OpenRouter key checks and LiteLLM tool compat parsing
2026-03-16 19:39:11 +05:30
levxn 48a54b4ee2 implements AS-13, trusted gating for project level skills 2026-03-16 17:45:33 +05:30
Hundao 7f7e8b4dff docs: update Windows guidance to reflect native support (#6519)
quickstart.ps1 and hive.ps1 provide full native Windows support.
Update README, CONTRIBUTING, and environment-setup docs to stop
recommending WSL as the primary path. Also add Windows alternatives
for make check/test commands in CONTRIBUTING.md.

Fixes #3835
Fixes #3839
2026-03-16 15:52:42 +08:00
Sundaram Kumar Jha f48a7380f5 Add command sanitizer module and enhance command validation (#6217)
* feat(tools): add command sanitizer module with blocklists for shell injection prevention

* fix(tools): validate commands in execute_command_tool before execution

* fix(tools): validate commands in coder_tools_server run_command before execution

* test(tools): add 109 tests for command sanitizer covering safe, blocked, and edge cases

* fix(tools): normalize executable sanitizer matching

Replace the flagged strip-based executable matching with explicit .exe suffix normalization in sanitizer paths to satisfy Ruff B005 while preserving blocking behavior for executable names.

Also apply the same normalization in coder_tools_server fallback sanitizer and clean a test-file formatting lint issue.

* fix(tools): harden command sanitizer handling

Normalize executable path matching, tighten python -c detection, and remove the duplicated coder_tools_server fallback by importing the shared sanitizer reliably.

Document the shell=True limitation in the command runners and add regression tests for absolute executable paths plus quoted python -c forms.
2026-03-16 14:46:53 +08:00
Gaurav Singh 3c7f129d86 fix(executor): enforce branch timeout and memory conflict strategy in parallel execution (#6504)
ParallelExecutionConfig.branch_timeout_seconds and memory_conflict_strategy
were declared but never read by any code. This caused branches to run
indefinitely and memory conflicts to go undetected.

Changes:
- Wrap parallel branch tasks with asyncio.wait_for() using configured timeout
- Switch asyncio.gather to return_exceptions=True so one timeout doesn't cancel siblings
- Handle asyncio.TimeoutError in result processing loop
- Implement last_wins/first_wins/error memory conflict strategies
- Track which branch wrote which key during fan-out for conflict detection
- Add 6 new tests covering timeout and conflict scenarios

Closes #5706
2026-03-16 14:31:09 +08:00
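A sketch of the timeout and conflict handling the executor fix describes, assuming each branch coroutine resolves to a dict of memory-key writes. asyncio.wait_for, return_exceptions=True, and the last_wins/first_wins/error strategies come from the commit; the function shape and names are illustrative.

```python
import asyncio
import logging

logger = logging.getLogger(__name__)


async def run_branches(branches: dict, timeout_s: float, strategy: str = "last_wins") -> dict:
    # asyncio.wait_for gives each branch its own deadline; return_exceptions=True
    # keeps one timed-out branch from cancelling its siblings.
    wrapped = [asyncio.wait_for(coro, timeout=timeout_s) for coro in branches.values()]
    results = await asyncio.gather(*wrapped, return_exceptions=True)

    memory: dict = {}
    writers: dict = {}  # key -> branch that wrote it, for conflict detection
    for name, result in zip(branches, results):
        if isinstance(result, TimeoutError):
            logger.warning("branch %s exceeded %ss timeout", name, timeout_s)
            continue
        if isinstance(result, Exception):
            logger.warning("branch %s failed: %s", name, result)
            continue
        for key, value in result.items():
            if key in writers:
                if strategy == "error":
                    raise RuntimeError(f"memory conflict on {key!r}: {writers[key]} vs {name}")
                if strategy == "first_wins":
                    continue
            memory[key] = value
            writers[key] = name
    return memory
```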
RichardTang-Aden 4533b27aa1 Merge pull request #6249 from aden-hive/fix/episodic-memory-access
fix: deduplicate queen memory tools into shared list
2026-03-15 20:26:29 -07:00
Richard Tang 3adf268c29 chore: ruff lint 2026-03-15 20:25:21 -07:00
Richard Tang ac8579900f Merge remote-tracking branch 'origin/main' into fix/episodic-memory-access 2026-03-15 20:23:13 -07:00
Richard Tang abbaaa68f3 Merge remote-tracking branch 'origin/main' 2026-03-15 20:19:32 -07:00
Richard Tang 11089093ef chore: remove deprecated step in quickstart 2026-03-15 20:05:23 -07:00
RichardTang-Aden 99b7cb07d5 Merge pull request #6300 from Nupreeth/docs/notion-tool-readme
docs(notion): add Notion tool README
2026-03-15 20:03:17 -07:00
RichardTang-Aden 70d61ae67a Merge pull request #6389 from saschabuehrle/micro-fix/issue-6015-step-numbering
micro-fix: remove vestigial duplicate Step 3 header in quickstart.sh
2026-03-15 20:01:36 -07:00
Richard Tang dd054815a3 docs: update product image 2026-03-15 19:56:17 -07:00
Timothy 8e5eaae9dd chore(micro-fix): windows string ops compatibility fix 2026-03-15 17:05:41 -07:00
Hundao 2d0128eb5c fix: declare croniter dependency and fail loudly on missing import (#6405)
croniter is used for cron-based timer entry points but was never
declared in pyproject.toml. A fresh install would silently skip
all cron triggers. Add croniter>=1.4.0 to dependencies and raise
RuntimeError instead of silently continuing on ImportError.

Fixes #5353
2026-03-15 18:29:05 +08:00
Milton Adina 06f1d4dcef docs: add Windows quickstart.ps1 instructions to getting-started.md (#5668)
- Add Windows (PowerShell) section alongside Linux/macOS
- Reference .\quickstart.ps1 for native Windows users
- Add Set-ExecutionPolicy note for script execution
- Link to environment-setup.md for WSL alternatives
2026-03-15 18:05:39 +08:00
Gowtham Tadikamalla 0e7b11b5b2 fix(llm): warn when litellm monkey-patches fail to apply due to ImportError (#5757)
Closes #5753

_patch_litellm_anthropic_oauth and _patch_litellm_metadata_nonetype
silently return when litellm internal modules change. This adds
logger.warning() calls so operators are alerted when patches cannot be
applied, instead of encountering cryptic 401 or TypeError at runtime.

Co-authored-by: GowthamT-1610 <gowthamt@umd.edu>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-15 17:59:36 +08:00
kalp patel 291b78f934 fix: prune ~/.hive/failed_requests/ to prevent unbounded disk growth (#5725)
Add MAX_FAILED_REQUEST_DUMPS = 50 cap and _prune_failed_request_dumps()
helper. After each _dump_failed_request() call the oldest files beyond
the cap are deleted so the directory never grows without bound.

Fixes #5696
2026-03-15 17:33:46 +08:00
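The cap-and-prune helper described above, as a sketch. The constant and helper names come from the commit message; the directory handling and mtime ordering are assumptions.

```python
from pathlib import Path

MAX_FAILED_REQUEST_DUMPS = 50
FAILED_REQUESTS_DIR = Path.home() / ".hive" / "failed_requests"


def _prune_failed_request_dumps() -> None:
    if not FAILED_REQUESTS_DIR.exists():
        return
    dumps = sorted(FAILED_REQUESTS_DIR.iterdir(), key=lambda p: p.stat().st_mtime)
    # Delete the oldest files beyond the cap so the directory never grows unbounded.
    excess = max(0, len(dumps) - MAX_FAILED_REQUEST_DUMPS)
    for stale in dumps[:excess]:
        stale.unlink(missing_ok=True)
```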
Vaibhav Kumar e196a03972 Fix LLMJudge OpenAI fallback to use LiteLLM provider (#5674) 2026-03-15 17:22:37 +08:00
Ishan Chaurasia a0abe2685d fix: preserve custom session ids in runtime logs (#6241)
* fix: preserve custom session ids in runtime logs

Treat any execution stored under sessions/<id> as a session-backed run so custom IDs stay visible in worker-session browsing and unified log APIs. Add regression coverage for custom IDs across executor path selection, log directory creation, and API listing.

Made-with: Cursor

* fix: ignore stray session directories in listing

Keep the session_ prefix as the fast path for worker session discovery, but allow custom IDs when a backing state.json exists. This avoids ghost directories in the UI while preserving the custom session ID support from the original fix.

Made-with: Cursor
2026-03-15 16:08:54 +08:00
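A sketch of the listing rule the second note above describes: keep the session_ prefix as the fast path, but accept custom IDs only when a backing state.json exists. Function and variable names are illustrative.

```python
from pathlib import Path

def discover_worker_sessions(sessions_root: Path) -> list[str]:
    sessions = []
    for entry in sorted(sessions_root.iterdir()):
        if not entry.is_dir():
            continue
        # Fast path: conventional session_<id> directories.
        # Fallback: custom IDs count only when a backing state.json exists,
        # which keeps ghost directories out of the UI listing.
        if entry.name.startswith("session_") or (entry / "state.json").is_file():
            sessions.append(entry.name)
    return sessions
```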
SRI LIKHITA ADRU e8f642c8b6 fix(credentials): aden_api_key delete returns 404 when not found, san… (#6340)
* fix(credentials): aden_api_key delete returns 404 when not found, sanitize 500 errors

* style: restore warning log for unexpected delete errors

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-15 15:56:32 +08:00
Abhilash Puli 6260f628eb feat(tools): add HuggingFace inference, embedding, and endpoint tools (#6132)
* feat(tools): add HuggingFace inference, embedding, and endpoint tools

* fix: resolve ruff E501 lint issues

* style: fix formatting and restore Hub API error message

* style: format test file

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-15 15:44:18 +08:00
Sundaram Kumar Jha 4a4f17ed40 fix quickstart guide for windows (#6264)
* fix(windows): verify uv is runnable before launch

* fix(windows): use validated uv path for kimi health check

* fix(windows): dedupe uv discovery and keep quickstart scoped

* chore: refresh uv lockfile
2026-03-15 15:19:15 +08:00
Fernando Mano 36dcf2025b Feature: #5871 - Improve developer agent logging: simplify terminal output (#6388) 2026-03-15 15:13:22 +08:00
Aryan Nandanwar 85c70c94e6 fix: queen bee multiple response error resolved (#5962)
* fix: queen bee multiple response error resolved

* fix: queen bee multiple response error resolved updates

* fix: added chatmsg.phase and reconcileOptimizeUser

* fix:cleaned up blank lines

* style: fix formatting in workspace.tsx

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-15 15:07:24 +08:00
saschabuehrle 336e82ba22 micro-fix: remove vestigial duplicate Step 3 header in quickstart.sh (fixes #6015) 2026-03-14 18:07:59 +01:00
Sundaram Kumar Jha a7b6b080ab chore(lockfiles): refresh generated lockfiles
- update frontend package-lock metadata after frontend validation
- refresh uv.lock editable package version for the current workspace state
2026-03-14 20:50:51 +05:30
Sundaram Kumar Jha 9202cbd4d4 fix(openrouter): stabilize quickstart and tool execution
- add cross-platform OpenRouter quickstart setup, config fallbacks, and key validation
- harden LiteLLM/OpenRouter tool execution, duplicate question handling, and worker loading UX
- add backend and frontend regression coverage for OpenRouter flows
2026-03-14 20:48:58 +05:30
Waryjustice f2ddd1051d fix: make state.json progress writes atomic
Use atomic_write for GraphExecutor._write_progress and log persistence failures instead of silently swallowing exceptions. Add regression tests for atomic write usage and warning logs on write failure.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-14 18:52:25 +05:30
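A sketch of an atomic progress write with logged (not swallowed) failures, along the lines of the commit above; this atomic_write is a stand-in, not necessarily the project's helper.

```python
import json
import logging
import os
import tempfile
from pathlib import Path

logger = logging.getLogger(__name__)

def atomic_write(path: Path, data: str) -> None:
    # Write to a temp file in the same directory, then atomically swap it into place.
    fd, tmp = tempfile.mkstemp(dir=path.parent, suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as fh:
            fh.write(data)
        os.replace(tmp, path)
    except Exception:
        if os.path.exists(tmp):
            os.unlink(tmp)
        raise

def write_progress(state_path: Path, progress: dict) -> None:
    try:
        atomic_write(state_path, json.dumps(progress))
    except OSError:
        # Log persistence failures instead of silently swallowing them.
        logger.warning("failed to persist progress to %s", state_path, exc_info=True)
```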
Aaryann Chandola 2dd60c8d52 Merge branch 'aden-hive:main' into feat/notion-tool-docs-and-improvements 2026-03-14 10:58:01 +05:30
Richard Tang ff01c1fd99 chore: release v0.7.1 — Chrome-native GCU, browser isolation, dummy agent tests
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-13 20:39:46 -07:00
RichardTang-Aden 421b25fdb7 Merge pull request #6313 from prasoonmhwr/bugFix/add_tab_ui
bugFix: micro-fix add tab UI
2026-03-13 20:29:30 -07:00
Richard Tang 795c3c33e2 docs: readme update 2026-03-13 20:26:44 -07:00
RichardTang-Aden 97821f4d80 Merge pull request #6346 from aden-hive/fix/session-resume-new-agent
fix: save json path for the new agent update meta.json when loaded worker
2026-03-13 20:19:48 -07:00
RichardTang-Aden 505e1e30fd Merge branch 'main' into fix/session-resume-new-agent 2026-03-13 20:19:36 -07:00
Timothy 3fb2b285fb chore: add star history widget 2026-03-13 20:17:35 -07:00
RichardTang-Aden a76109840c Merge pull request #6345 from aden-hive/feat/gcu-updates
feat: GCU browser cleanup, draft loading state, and inner_turn message fix
2026-03-13 20:16:38 -07:00
Timothy 1db8484402 Merge branch 'main' into feature/agent-skills 2026-03-13 20:05:47 -07:00
RichardTang-Aden 39212350ba Merge pull request #6342 from aden-hive/ci/level-2-dummy-agent-testing
Add Level 2 dummy agent end-to-end tests
2026-03-13 19:42:34 -07:00
Richard Tang f3399fe95b chore: ruff lint 2026-03-13 19:39:44 -07:00
Richard Tang d02e1155ed feat: dummy agent tests 2026-03-13 19:39:14 -07:00
bryan 7ede3ba171 feat: queen upsert fix 2026-03-13 19:34:26 -07:00
Timothy cdaec8a837 feat: agent skills 2026-03-13 18:56:34 -07:00
Richard Tang 2272491cf5 chore: remove dead code 2026-03-13 18:10:43 -07:00
RichardTang-Aden bb38cb974f Merge pull request #6333 from aden-hive/fix/new-agent-resume
Fix: new agent resume and GCU browser improvements
2026-03-13 17:20:49 -07:00
bryan 635d2976f4 feat: show loading spinner in draft panel during planning phase 2026-03-13 16:40:33 -07:00
bryan 4e1525880d feat: clean up browser profile after top-level GCU node execution 2026-03-13 16:40:20 -07:00
Richard Tang b80559df68 chore: ruff lint 2026-03-13 16:38:50 -07:00
RichardTang-Aden 08d93ef90a Merge pull request #6331 from RichardTang-Aden/main
fix: generate worker mcp.json correctly in initialize_agent_package
2026-03-13 15:35:18 -07:00
Richard Tang 22bf035522 chore: fix lint 2026-03-13 15:35:01 -07:00
Richard Tang 15944a42ab fix: generate worker mcp file correctly 2026-03-13 15:30:28 -07:00
Richard Tang 8440ec70ba chore: document the difference between runner mode run() and start() 2026-03-13 15:28:18 -07:00
Timothy eacf2520cf chore: skills prd 2026-03-13 15:22:09 -07:00
Richard Tang def4f62a51 fix: update meta.json when loading worker 2026-03-13 14:05:57 -07:00
bryan b0c5bcd210 chore: update tab management guidelines and add concurrent subagent patterns 2026-03-13 14:04:40 -07:00
bryan 2fe1343343 feat: inject unique browser profile per GCU subagent 2026-03-13 14:03:21 -07:00
bryan de0dcff50f feat: add tab origin/age metadata and per-subagent profile isolation 2026-03-13 14:02:15 -07:00
Richard Tang 20427e213a fix: update meta.json when loading worker 2026-03-13 13:52:15 -07:00
bryan 1fb5c6337a fix: anchor worker monitoring to queen's session ID on cold-restore 2026-03-13 12:50:50 -07:00
Timothy @aden 1e74f194a1 Update authors in MCP Server Registry document 2026-03-13 12:15:50 -07:00
Timothy 08157d2bd6 chore(docs): bounty program - standard 2026-03-13 12:10:21 -07:00
Timothy ef036257a9 docs(mcp): MCP integration PRD 2026-03-13 11:56:33 -07:00
Timothy 16ce984c74 chore: add default context limit on windows quickstart 2026-03-13 10:04:49 -07:00
Kartik d433cda209 fix: use CredentialStoreAdapter in sap_tool instead of raw os.getenv()
Made-with: Cursor
2026-03-13 22:30:50 +05:30
bryan 1e8b5b96eb Merge branch 'main' into feat/gcu-updates 2026-03-13 09:26:06 -07:00
Prasoon Mahawar 094ba89f19 Merge branch 'main' of https://github.com/prasoonmhwr/hive into bugFix/add_tab_ui 2026-03-13 18:59:44 +05:30
Prasoon Mahawar 7008c9f310 bugFix: UI overflow issue when creating multiple agents – “Add tab” dropdown partially hidden 2026-03-13 18:58:38 +05:30
Prasoon Mahawar 94d7cbacc2 Revert "bugFix: Clipboard write in SystemPromptTab lacks error handling and may show false Copied feedback"
This reverts commit bddc2b413a.
2026-03-13 18:55:52 +05:30
Prasoon Mahawar bddc2b413a bugFix: Clipboard write in SystemPromptTab lacks error handling and may show false Copied feedback 2026-03-13 18:23:36 +05:30
Nupreeth 48c8fb7fff docs(notion): add Notion tool README 2026-03-13 12:03:48 +05:30
RichardTang-Aden 52b1a3f472 Merge pull request #6282 from aden-hive/feat/refactor-session
Refactor session lifecycle with flowchart planning and triggers
2026-03-12 21:15:10 -07:00
Richard Tang 079e00c8f7 Merge remote-tracking branch 'origin/main' into feat/refactor-session 2026-03-12 21:13:15 -07:00
Richard Tang 60bba38941 chore: ruff lint 2026-03-12 21:01:47 -07:00
Richard Tang ea8e7b11c6 Merge remote-tracking branch 'origin/feature/flowchart-linked-experimental' into feat/refactor-session 2026-03-12 20:54:08 -07:00
Richard Tang 3dc2b25b01 fix: adding the trigger helpers 2026-03-12 20:53:45 -07:00
bryan 543b90b34f chore: tooltip update 2026-03-12 20:50:39 -07:00
Richard Tang 2ad78ec8a2 Merge remote-tracking branch 'origin/feature/flowchart-linked-experimental' into feat/refactor-session 2026-03-12 20:48:09 -07:00
Timothy 412658e9f2 fix: remove subagent shapes 2026-03-12 20:46:09 -07:00
Richard Tang 9bfddec322 fix: missing _FLOWCHART_TYPES reference 2026-03-12 20:43:03 -07:00
Timothy bbd9c10169 fix: decision node cannot have subagents 2026-03-12 20:36:04 -07:00
Richard Tang 51fdc4ddde fix: always new session for new agent 2026-03-12 20:34:42 -07:00
Richard Tang 04685d33ca fix: solve the problem from merge conflict 2026-03-12 20:28:25 -07:00
Richard Tang 729a0e0cec fix: resolve merge conflict 2026-03-12 20:23:58 -07:00
bryan 2bcb0cacee added pause/run button 2026-03-12 20:15:25 -07:00
Timothy 44bf191f53 fix: no orphaned node by bfs 2026-03-12 20:04:00 -07:00
Richard Tang 993b31f19b Merge remote-tracking branch 'origin/feature/flowchart-linked-experimental' into feat/refactor-session 2026-03-12 20:00:45 -07:00
Richard Tang 41b3b9619f Merge remote-tracking branch 'origin/feature/flowchart-linked-experimental' into feature/flowchart-linked-experimental 2026-03-12 19:45:45 -07:00
Richard Tang 2a4fe4020c feat: force the planning agent to ask questions 2026-03-12 19:45:07 -07:00
Ishan Chaurasia 9d1f268078 fix(server): honor session_id in one-step session creation (#6233)
Align POST /api/sessions behavior across queen-only and one-step worker creation so callers can rely on deterministic session IDs. Add a regression test covering the forwarded session_id contract.

Made-with: Cursor
2026-03-13 10:43:12 +08:00
bryan 2185e127b1 style: coder tools formatting and template quote fixes 2026-03-12 19:39:53 -07:00
bryan 99ed885fd0 fix: add cached_tokens to finish event test assertion 2026-03-12 19:39:53 -07:00
bryan d8a390a685 feat: flowchart rendering in DraftGraph with node shapes and layout 2026-03-12 19:39:53 -07:00
bryan f50cf1735b feat: CSS variable theming for agent graph components 2026-03-12 19:39:53 -07:00
bryan 04eb57f54e feat: auto-load worker on cold restore when queen resumes 2026-03-12 19:39:53 -07:00
bryan 7378408eb8 feat: add flowchart type system and draft-to-graph dissolution 2026-03-12 19:39:53 -07:00
bryan cf05420417 style: formatting and import cleanup across framework modules 2026-03-12 19:38:55 -07:00
Timothy f5ed4c7d43 fix: validate orphaned gcu node 2026-03-12 19:38:44 -07:00
Timothy 5547432b6e fix: queen defaults to global max context tokens 2026-03-12 19:29:14 -07:00
Ishan Chaurasia 336557d7c7 fix: pass browser_wait text as data (#6235)
Pass browser_wait text through Playwright's function argument channel so quoted and multiline strings do not break the generated wait expression. Add a regression test covering text that previously would have been interpolated unsafely.

Made-with: Cursor
2026-03-13 10:08:16 +08:00
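A sketch of the safer wait described above, passing the text through Playwright's function-argument channel (sync API; `page` is assumed to be a Playwright Page, and the wrapper name is illustrative).

```python
def wait_for_text(page, text: str, timeout_ms: int = 10_000) -> None:
    # `text` travels as `arg`, so quotes and multiline strings cannot break the expression.
    page.wait_for_function(
        "text => document.body.innerText.includes(text)",
        arg=text,
        timeout=timeout_ms,
    )
```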
Timothy 87c172227c fix: mandate flowchart topology correction 2026-03-12 19:03:46 -07:00
Richard Tang c2c4929de8 feat: remove the phase in the label 2026-03-12 18:55:24 -07:00
Timothy a978338738 fix: allow replanning 2026-03-12 18:54:01 -07:00
Timothy 8eb59b1f66 fix: mandate usage of ask tools and change pending behavior 2026-03-12 18:34:15 -07:00
Richard Tang f9d5f95936 Merge remote-tracking branch 'origin/feature/flowchart-linked-experimental' into feat/refactor-session 2026-03-12 18:32:26 -07:00
Timothy 651e99ffe3 Merge branch 'feature/multiple-asks' into feature/flowchart-linked-experimental 2026-03-12 17:57:11 -07:00
Timothy 2564f1b948 feat: allow multiple questions 2026-03-12 17:56:58 -07:00
Richard Tang c01cd528d2 feat: planning phase prompt improvements 2026-03-12 17:44:06 -07:00
bryan 2434c86cdf docs: clarify two-step escalation relay protocol in queen prompt 2026-03-12 16:50:17 -07:00
Timothy bc194ee4e9 Merge branch 'main' into feature/flowchart-linked-experimental 2026-03-12 16:50:17 -07:00
bryan c4a5e621aa docs: update GCU prompt with popup tracking and close_all guidance 2026-03-12 16:50:06 -07:00
bryan 0f5b83d86a feat: add browser_close_all tool for bulk tab cleanup 2026-03-12 16:49:55 -07:00
bryan b5aadcd51e feat: auto-track popup pages and improve session startup logging 2026-03-12 16:49:46 -07:00
bryan 290d2f6823 feat: add --no-startup-window to Chrome launch flags 2026-03-12 16:49:36 -07:00
Timothy @aden 2bac100c03 Merge pull request #6283 from vincentjiang777/main
docs: rename and expand contributing guidelines
2026-03-12 16:46:59 -07:00
Timothy @aden 425d37f868 Merge branch 'main' into main 2026-03-12 16:44:29 -07:00
Vincent Jiang 99b127e2da docs: revert filename to CONTRIBUTING.md for GitHub compliance
Changed HOW_TO_CONTRIBUTE.md back to CONTRIBUTING.md to comply with
GitHub's standard for contributing guidelines files.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-03-12 16:42:42 -07:00
Timothy 43b759bf61 fix: ensure flowchart existence 2026-03-12 16:40:18 -07:00
Vincent Jiang 20d8d52f12 docs: rename and expand contributing guidelines
Renamed CONTRIBUTING.md to HOW_TO_CONTRIBUTE.md and significantly expanded
the documentation with detailed sections on development setup, OS support,
tooling requirements, performance metrics, and contribution workflows.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-03-12 16:29:13 -07:00
Richard Tang 944567dc31 chore: ruff lint 2026-03-12 16:23:13 -07:00
nightcityblade 7e09588e4e fix: reject path-like agent names in hive dispatch --agents (#6211)
Validate that agent names passed to --agents do not contain path
separators. Previously, passing 'exports/my_agent' would result in
the doubled path 'exports/exports/my_agent' with a confusing error.
Now a clear error message is shown suggesting the correct usage.

Fixes #6208

Co-authored-by: nightcityblade <nightcityblade@gmail.com>
2026-03-12 16:22:37 -07:00
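A sketch of the agent-name validation described above; the function name and error message are illustrative.

```python
import os

def validate_agent_name(name: str) -> str:
    # Reject names containing path separators so 'exports/my_agent' cannot be
    # silently doubled into 'exports/exports/my_agent'.
    if os.sep in name or (os.altsep and os.altsep in name):
        raise ValueError(
            f"invalid agent name {name!r}: pass the bare agent name "
            "(e.g. 'my_agent'), not a path like 'exports/my_agent'"
        )
    return name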
Priyanka Bhallamudi 7bf69d2263 fix: read nodes from graph object in discovery.py for correct node count (#6227)
Co-authored-by: Lakshmi Priyanka Bhallamudi <priyanka@Lakshmis-MacBook-Air.local>
2026-03-12 16:22:37 -07:00
bryan 99d2b0c003 chore: update readme 2026-03-12 16:22:37 -07:00
bryan 8868416baa chore: update the tests and readme 2026-03-12 16:22:37 -07:00
bryan 405b120674 feat: fixed google credentials to use the google oauth credential 2026-03-12 16:22:37 -07:00
Trisha 66a7b43199 [bug:6117:docs]: fix inconsistent configuration and troubleshooting guidance (#6118) 2026-03-12 16:22:36 -07:00
Trisha a8f9d83723 docs: fix typos and awkward copy (#6115)
* [bug:6109:README]: fix typos and awkward copy

* trigger ci

* rerun checks
2026-03-12 16:22:36 -07:00
bryan d95d5804ca fix: align the credential functions to be the same 2026-03-12 16:22:36 -07:00
Richard Tang 674cf05601 feat: track the number of runs 2026-03-12 15:19:13 -07:00
Timothy 86349c78d0 Merge branch 'feature/guardrails' into feature/flowchart-linked-experimental 2026-03-12 15:11:12 -07:00
Timothy 2232f49191 fix: queen flowcharting behavior 2026-03-12 15:10:32 -07:00
Richard Tang 6fa71fa27d feat: track queen phase by message 2026-03-12 14:58:35 -07:00
Vincent Jiang 1ac9ba69d6 docs: replace recipe examples with 100 sample agent prompts
Replace individual recipe READMEs with a comprehensive collection of 100 real-world agent prompt examples across marketing, sales, operations, engineering, and finance. This provides users with a broader range of use case inspiration in a single, organized reference document.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-03-12 14:46:09 -07:00
Vincent Jiang 9e16be8f03 docs: replace recipe examples with 100 sample agent prompts
Replace individual recipe READMEs with a comprehensive collection of 100 real-world agent prompt examples across marketing, sales, operations, engineering, and finance. This provides users with a broader range of use case inspiration in a single, organized reference document.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-03-12 14:44:32 -07:00
Richard Tang 8c7065ad37 refactor: remove the parts conversion logic 2026-03-12 14:36:27 -07:00
Richard Tang a18ed5bbe6 feat: restore queen phase 2026-03-12 14:29:01 -07:00
bryan 9f3339650d chore: linter update 2026-03-12 14:27:17 -07:00
bryan d5e5d3e83d feat: add subagent activity tracking to queen status and instructions 2026-03-12 14:26:49 -07:00
bryan 5ea27dda09 refactor: update GCU system prompt for auto-snapshots and batching 2026-03-12 14:26:38 -07:00
bryan 6f9066ef20 feat: return auto-snapshot from browser interaction tools 2026-03-12 14:26:24 -07:00
bryan c37185732a feat: kill orphaned Chrome processes on GCU server shutdown 2026-03-12 14:26:05 -07:00
bryan 0c900fb50e refactor: clean session startup and add page lifecycle management 2026-03-12 14:25:16 -07:00
bryan 4d3ac28878 feat: launch Chrome on macOS via open -n to coexist with user's browser 2026-03-12 14:24:55 -07:00
bryan 270c1f8c50 fix: use lazy %-formatting in subagent completion log to avoid f-string in logger 2026-03-12 14:24:30 -07:00
bryan 3d0859d06a fix: stop clearing credentials_required on modal close to prevent infinite loop 2026-03-12 14:24:14 -07:00
Timothy 8f55170c1e fix: compaction ratio reporting 2026-03-12 14:17:42 -07:00
Richard Tang ed3d4bfe33 feat: resume cold session from event logs 2026-03-12 14:07:57 -07:00
Timothy 31a98a5f95 feat: cached token handling 2026-03-12 14:03:58 -07:00
Timothy 7667b773f2 fix: 18x tool discovery efficiency by progressive disclosure 2026-03-12 13:12:43 -07:00
Sri-Likhita-adru 8b99bb8590 fix(llm): cap transient stream error retries at STREAM_TRANSIENT_MAX_RETRIES=3 2026-03-12 15:37:49 -04:00
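A sketch of a bounded retry loop matching the cap above; everything except STREAM_TRANSIENT_MAX_RETRIES is illustrative.

```python
STREAM_TRANSIENT_MAX_RETRIES = 3

def stream_with_retries(start_stream, is_transient):
    last_exc = None
    # One initial attempt plus up to three retries for transient stream errors.
    for attempt in range(STREAM_TRANSIENT_MAX_RETRIES + 1):
        try:
            return start_stream()
        except Exception as exc:
            if not is_transient(exc):
                raise
            last_exc = exc
    raise last_exc
```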
Timothy 49560260de fix: token counts 2026-03-12 11:52:08 -07:00
Richard Tang 596ce9878d feat: unique run id 2026-03-12 11:09:36 -07:00
Timothy 1cc75f89bd feat: replanning 2026-03-12 09:55:42 -07:00
bryan ffe47c0f71 fix: credential modal eating errors, banner stays open 2026-03-12 09:41:53 -07:00
Timothy bb3c69cff1 fix: proper guardrail on combined context window 2026-03-12 09:37:17 -07:00
Timothy 70d11f537e feat: merge subagent nodes 2026-03-12 09:06:41 -07:00
Timothy b15dd2f623 fix: better logging 2026-03-12 09:03:29 -07:00
Timothy ce308312ae fix: usage tracking 2026-03-12 08:56:33 -07:00
bryan bf4652db4b fix: share event bus so tool events are visible to parent 2026-03-12 08:41:34 -07:00
bryan 2acd526b71 feat: dynamic viewport sizing and suppress Chrome warning bar 2026-03-12 08:40:49 -07:00
bryan df71834e4b refactor: switch from Playwright browser to system Chrome via CDP 2026-03-12 08:39:43 -07:00
nightcityblade f757c724cc fix: reject path-like agent names in hive dispatch --agents (#6211)
Validate that agent names passed to --agents do not contain path
separators. Previously, passing 'exports/my_agent' would result in
the doubled path 'exports/exports/my_agent' with a confusing error.
Now a clear error message is shown suggesting the correct usage.

Fixes #6208

Co-authored-by: nightcityblade <nightcityblade@gmail.com>
2026-03-12 21:11:02 +08:00
Priyanka Bhallamudi a4c758403e fix: read nodes from graph object in discovery.py for correct node count (#6227)
Co-authored-by: Lakshmi Priyanka Bhallamudi <priyanka@Lakshmis-MacBook-Air.local>
2026-03-12 18:34:47 +08:00
Timothy bc3c5a5899 fix: allow memory tool to be used in all phases 2026-03-11 20:10:24 -07:00
Timothy a67563850b feat: flowchart reconciliation 2026-03-11 19:58:27 -07:00
Bryan @ Aden b48465b778 Merge pull request #6230 from aden-hive/feat/google-doc-credential-alignment
micro-fix: Feat/google doc credential alignment
2026-03-12 02:52:03 +00:00
bryan d3baaaab24 chore: update readme 2026-03-11 19:48:00 -07:00
Timothy c764b4dc3b Merge branch 'main' into feature/flowchart-linked-experimental 2026-03-11 19:12:51 -07:00
bryan ad6077bd7b chore: update the tests and readme 2026-03-11 19:12:38 -07:00
Timothy ce2a91b1c0 feat: flowchart mapping 2026-03-11 19:12:25 -07:00
bryan c2e7afeb5e feat: fixed google credentials to use the google oauth credential 2026-03-11 19:12:25 -07:00
Timothy 0c9680ca89 feat: dissolution graph structure 2026-03-11 18:38:17 -07:00
Richard Tang 726016d24a fix: remove the duplicated session logic 2026-03-11 17:11:03 -07:00
Richard Tang 4895cea08a chore: lint and micro-fix 2026-03-11 16:55:29 -07:00
Richard Tang c9723a3ff2 feat(wip): always resume the previous session 2026-03-11 16:48:31 -07:00
Richard Tang 6cb73a6fea refactor: remove the remaining old trigger format and change the trigger format in examples to the latest format 2026-03-11 16:13:37 -07:00
Richard Tang 0c7f43f595 refactor: remove reference of the unused session judge 2026-03-11 16:01:00 -07:00
Richard Tang ea5cfcc5d6 refactor: remove the unused session judge 2026-03-11 15:57:19 -07:00
Richard Tang 34e85019c3 feat: stop supporting the old scheduler 2026-03-11 15:54:48 -07:00
Timothy 8011b72673 fix: flowchart display 2026-03-11 15:41:55 -07:00
RichardTang-Aden d87dfca1ab Merge pull request #6075 from aden-hive/fix/credential-function-alignment
fix: align the credential functions to be the same
2026-03-11 15:11:57 -07:00
Richard Tang c979dba958 fix: reference error from the rename 2026-03-11 14:33:42 -07:00
Richard Tang b4caa045e1 Merge remote-tracking branch 'origin/main' into feat/agent-trigger 2026-03-11 14:32:36 -07:00
Timothy b0fd4bc356 fix: draft flowchart display 2026-03-11 11:05:33 -07:00
Trisha a79d7de482 [bug:6117:docs]: fix inconsistent configuration and troubleshooting guidance (#6118) 2026-03-11 14:41:54 +08:00
Trisha e5e57302fa docs: fix typos and awkward copy (#6115)
* [bug:6109:README]: fix typos and awkward copy

* trigger ci

* rerun checks
2026-03-11 14:38:37 +08:00
Emmanuel Nwanguma c69cf1aea5 test(security): add comprehensive unit tests for 7 security scanning tools (#6151)
* test(security): add comprehensive unit tests for 7 security scanning tools

Add dedicated test files for all security scanning tools:
- test_dns_security_scanner.py (12 tests)
- test_http_headers_scanner.py (13 tests)
- test_ssl_tls_scanner.py (14 tests)
- test_subdomain_enumerator.py (15 tests)
- test_port_scanner.py (17 tests)
- test_tech_stack_detector.py (20 tests)
- test_risk_scorer.py (24 tests)

Total: 115 new tests covering:
- Input validation and cleaning
- Connection error handling
- Core scanning logic with mocked responses
- Grade/risk calculation
- Edge cases

Fixes #5920

* fix(tests): strengthen weak assertions in security scanner tests

- SSL scanner: replace always-true `or` assertions with specific checks
  that verify hostname stripping actually happened
- Port scanner: verify timeout clamp value, not just absence of error
- DNS scanner: remove unused helper method

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-11 13:29:11 +08:00
Emmanuel Nwanguma 2f4cd8c36f fix(credentials): improve exception handling in key_storage.py (#6153)
Replace bare except Exception: clauses with specific exception handling:

- delete_aden_api_key(): Catch FileNotFoundError, PermissionError at debug
  level; log unexpected errors at WARNING with exc_info=True
- _read_credential_key_file(): Catch FileNotFoundError, PermissionError at
  debug level; log unexpected errors at WARNING with exc_info=True
- _read_aden_from_encrypted_store(): Catch FileNotFoundError, PermissionError,
  KeyError at debug level; log unexpected errors at WARNING with exc_info=True

This makes credential issues easier to diagnose by:
- Logging unexpected errors at WARNING level (visible in production)
- Including full stack traces with exc_info=True
- Keeping expected failures (file not found, permissions) at debug level

Fixes #5931
2026-03-11 13:05:10 +08:00
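A sketch of the tiered exception handling described above; the function body and path handling are assumptions, only the log-level split mirrors the commit.

```python
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

def _read_credential_key_file(path: Path) -> str | None:
    try:
        return path.read_text().strip()
    except (FileNotFoundError, PermissionError) as exc:
        # Expected failures stay quiet at debug level.
        logger.debug("credential key file unavailable at %s: %s", path, exc)
    except Exception:
        # Unexpected errors stay visible in production, with a full stack trace.
        logger.warning("unexpected error reading credential key file %s", path, exc_info=True)
    return None
```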
Aaryann Chandola 6f571e6d00 [BUG] fix: use ReplaceFileW for atomic writes on Windows to preserve ACLs (#5849)
* [BUG] fix: use ReplaceFileW for atomic writes on Windows to preserve ACLs

* fix: ensure atomic_replace checks for Windows API availability
2026-03-11 12:59:14 +08:00
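A sketch of a Windows-aware atomic replace along the lines of the commit above: ReplaceFileW keeps the destination's ACLs, unlike plainly renaming a fresh temp file over it. The fallback and flag choices are assumptions.

```python
import ctypes
import os
import sys

def atomic_replace(src: str, dst: str) -> None:
    # Use the Windows API only when it is available and the target already exists.
    if sys.platform == "win32" and os.path.exists(dst):
        ok = ctypes.windll.kernel32.ReplaceFileW(dst, src, None, 0, None, None)
        if not ok:
            raise ctypes.WinError()
        return
    # POSIX (and first-time Windows writes): os.replace is already atomic.
    os.replace(src, dst)
```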
Emmanuel Nwanguma 31bc84106f test: add API integration tests for hubspot, intercom, google_docs tools (#6167)
Resolves #5921

- test_hubspot_tool.py: 51 tests covering 15 MCP tools
- test_intercom_tool.py: 50 tests covering 11 MCP tools
- test_google_docs_tool.py: 57 tests covering 11 MCP tools
Timothy bdd6194203 feature: hive flowchart at planning phase 2026-03-10 19:54:02 -07:00
RichardTang-Aden fd79dceb0f Merge pull request #6166 from aden-hive/fix/subagent-reply-stall
micro-fix: update escalation tests for new ESCALATION_REQUESTED flow
2026-03-10 19:47:00 -07:00
Richard Tang ad50139d67 chore: lint 2026-03-10 19:46:35 -07:00
Richard Tang 12fb40c110 test: update escalation tests for ESCALATION_REQUESTED flow
Tests were asserting the old CLIENT_OUTPUT_DELTA + CLIENT_INPUT_REQUESTED
pattern; the fix in 89ccd66f routes escalations through the queen via
ESCALATION_REQUESTED instead.
2026-03-10 19:45:21 -07:00
RichardTang-Aden 738e469d96 Merge pull request #6165 from aden-hive/feature/provider-moonshotai-kimi
feat: support MoonShot AI Kimi subscription
2026-03-10 19:39:25 -07:00
Timothy 80ccbcc827 chore: lint 2026-03-10 19:37:18 -07:00
RichardTang-Aden 08fac31a9d Merge pull request #6159 from aden-hive/fix/subagent-reply-stall
fix: route subagent report_to_parent escalations to queen instead of user
2026-03-10 18:24:33 -07:00
Richard Tang 89ccd66fb9 fix: subagent _EscalationReceiver 2026-03-10 18:21:50 -07:00
Timothy 7c47e367de feat: support moonshotai kimi subscription 2026-03-10 18:03:44 -07:00
Timothy b8741bf94c fix: queen agent system prompt hooks 2026-03-10 16:25:07 -07:00
Aaryann Chandola e82133741c Merge branch 'aden-hive:main' into feat/notion-tool-docs-and-improvements 2026-03-11 04:23:20 +05:30
RichardTang-Aden c90dcbb32f Merge pull request #6152 from aden-hive/refactor/remove-dead-code
refactor: remove deprecated code
2026-03-10 15:31:34 -07:00
Richard Tang ac3a5f5e93 chore: remove the ai generated temp doc 2026-03-10 15:29:21 -07:00
Timothy 1ccfdbbf7d chore: minimax key check 2026-03-10 15:24:09 -07:00
Timothy 1de37d2747 chore: lint 2026-03-10 15:00:14 -07:00
Timothy 2aefdf5b5f refactor: remove deprecated code 2026-03-10 14:57:54 -07:00
Antiarin 5076278dcb feat(notion): register Notion tool in verified and unverified registration functions
- Added the Notion tool registration to the _register_verified function.
- Removed the Notion tool registration from the _register_unverified function to ensure proper handling.
2026-03-11 02:45:51 +05:30
Antiarin 2398e04e11 docs(notion): add README for Notion tool with setup instructions and usage examples
- Introduced a comprehensive README.md for the Notion tool.
- Included setup instructions for the Notion API token and credential store configuration.
- Documented available tools and their functionalities.
- Provided usage examples for searching, creating, updating, and managing pages and databases.
2026-03-11 02:45:41 +05:30
Antiarin d00f321627 test(notion): add comprehensive tests for error handling and credential store in Notion tool
- Implemented tests for HTTP error codes, timeouts, and generic exceptions in _request.
- Added tests to verify the use of credential store when provided.
- Enhanced tests for notion_search to include filter types and page size clamping.
- Updated test assertions for successful responses from notion_get_page.
2026-03-11 02:45:30 +05:30
Antiarin e76b6cb575 feat(notion): enhance Notion tool functionality with new block types and improved page creation
- Added BlockType enum for various Notion block types.
- Updated notion_create_page to allow specifying parent_page_id and title_property.
- Enhanced notion_query_database to support sorting and pagination.
- Introduced notion_create_database for creating databases under a parent page.
- Improved error handling for required parameters in page and database creation.
2026-03-11 02:45:12 +05:30
Hundao 4caaa79900 Merge pull request #5988 from roberthallers/docs/fix-tui-deprecation-5941
docs: fix TUI deprecation inconsistency in roadmap
2026-03-10 16:46:41 +08:00
Hundao 296089d4cd Merge pull request #6108 from Hundao/fix/subagent-judge-feedback
fix: SubagentJudge and implicit judge return feedback=None on ACCEPT
2026-03-10 15:39:29 +08:00
hundao cae5f971cf fix: update test assertions for newly added tools
Tool counts and expected lists were outdated after new tools were added
to stripe, linear, apollo, discord, and google_analytics.
2026-03-10 15:36:12 +08:00
hundao bac716eea3 fix: pass feedback="" on evaluated ACCEPT verdicts in SubagentJudge and implicit judge
Fixes #6107
2026-03-10 15:24:39 +08:00
Navya Bijoy 14daf672e8 Fix: SessionManager._cleanup_stale_active_sessions indiscriminately cancels healthy concurrent agent sessions (#6081)
* fixes a bug in the SessionManager

* chore: remove debug print from test

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-10 15:18:11 +08:00
Emmanuel Nwanguma e352ae5145 fix(mcp): close errlog file handle to prevent resource leak (#6094)
Track the errlog file handle opened on non-Windows systems and
properly close it during cleanup to prevent file descriptor leaks.

Changes:
- Add _errlog_handle instance variable to track the file handle
- Store handle reference when opening os.devnull
- Close handle in _cleanup_stdio_async() after other cleanup
- Clear reference in disconnect() for safety

Fixes #6002
2026-03-10 15:06:51 +08:00
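A sketch of tracking and closing the errlog handle as the commit above describes; the class and method names mirror the commit message where given, the rest is illustrative.

```python
import os

class StdioTransport:
    def __init__(self):
        self._errlog_handle = None

    def connect(self):
        # On non-Windows systems the child's stderr is redirected to /dev/null;
        # keep the handle so it can be closed later instead of leaking a descriptor.
        self._errlog_handle = open(os.devnull, "w")
        return self._errlog_handle

    async def _cleanup_stdio_async(self):
        if self._errlog_handle is not None:
            self._errlog_handle.close()

    def disconnect(self):
        # Clear the reference for safety after cleanup.
        self._errlog_handle = None
```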
Pushkal a58ffc2669 fix(server): use session.phase_state instead of session.mode_state in handle_pause (#6069)
The handle_pause endpoint referenced session.mode_state (lines 360-361),
which does not exist on the Session dataclass. This caused an
AttributeError every time the pause endpoint reached the phase transition
step, preventing the queen phase from transitioning to staging and
returning a 500 error to the frontend.

Changed to session.phase_state, consistent with handle_stop (line 412),
handle_run (line 75), and the Session dataclass definition
(session_manager.py line 44).
2026-03-10 15:03:19 +08:00
RichardTang-Aden 3fefea52be Merge pull request #6102 from aden-hive/micro-fix/report-to-parent-empty-check
micro-fix: track reported_to_parent to prevent false empty-turn detection
2026-03-09 21:12:23 -07:00
Richard Tang 06fd045b3e micro-fix: track reported_to_parent to prevent false empty-turn detection
Turns that call report_to_parent were incorrectly treated as "truly
empty" because the flag was not propagated. Thread it through
_run_single_turn and include it in the empty-turn guard.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 21:10:47 -07:00
RichardTang-Aden 2e43d2af46 Merge pull request #6100 from aden-hive/feature/integration-extended
micro-fix: wrong reference for hive_coder
2026-03-09 19:52:35 -07:00
Richard Tang 2c9790c65d Merge remote-tracking branch 'origin' into feature/integration-extended 2026-03-09 19:52:17 -07:00
Richard Tang 9700ac71bb micro-fix: wrong reference for hive_coder 2026-03-09 19:50:07 -07:00
RichardTang-Aden 61ed67b068 Merge pull request #6097 from aden-hive/feature/integration-extended
Expand integration tool coverage across 40 vendors
2026-03-09 19:47:34 -07:00
Richard Tang c3bea8685a Merge remote-tracking branch 'origin/main' into feature/integration-extended 2026-03-09 19:47:21 -07:00
RichardTang-Aden 98c57b795a Merge pull request #6050 from aden-hive/feat/queen-planning-phase
Add queen planning phase, global memory, and refactor hive_coder
2026-03-09 19:46:23 -07:00
Richard Tang 9be1d03b5c chore: ruff lint 2026-03-09 19:45:36 -07:00
Richard Tang 0d09510539 Merge remote-tracking branch 'origin/main' into feat/queen-planning-phase 2026-03-09 19:42:10 -07:00
Richard Tang 639c37ba17 feat: prompt to init the agent 2026-03-09 19:34:01 -07:00
Richard Tang 2258c23254 Merge branch 'feature/queen-global-memory' into feat/queen-planning-phase 2026-03-09 19:11:32 -07:00
Richard Tang 9714ea106d feat: improve initialize_and_build_agent clarity 2026-03-09 18:54:48 -07:00
Timothy f4ad500177 chore: lint 2026-03-09 18:53:01 -07:00
Timothy 9154a4d9f8 fix: resolve E501 line-too-long lint errors across 7 tool files 2026-03-09 18:51:01 -07:00
Timothy add6efe6f1 fix(micro-fix): increase stall threshold 2026-03-09 18:40:13 -07:00
Richard Tang 7ceb1efd02 fix: replace old tool name reference 2026-03-09 18:40:01 -07:00
Timothy a29ecf8435 chore(micro-fix): fix ci test blockage 2026-03-09 18:27:21 -07:00
Richard Tang d0ba5ef4f4 fix: update the wrong variable name 2026-03-09 18:12:29 -07:00
Richard Tang 860f637491 feat: add validation for module import 2026-03-09 17:53:50 -07:00
Richard Tang acb2cab317 feat: minor prompt change for switching to building mode 2026-03-09 17:41:23 -07:00
Richard Tang b453806918 feat: execution end message 2026-03-09 17:29:58 -07:00
Richard Tang 7ba8a0f51b feat: strengthen validation logic when loading 2026-03-09 17:08:20 -07:00
Richard Tang f6f398b6b1 feat: add GCU knowledge to planning 2026-03-09 17:02:13 -07:00
Timothy c4b22fa5c4 feat(postgres): update credential spec with new tool names 2026-03-09 16:47:27 -07:00
Timothy 0e64f977cd feat(postgres): add table stats, indexes, and foreign keys tools
Add pg_get_table_stats for row counts and size info,
pg_list_indexes for index details, and pg_get_foreign_keys
for relationship discovery with both outgoing and incoming FKs.
2026-03-09 16:47:09 -07:00
Timothy f24c9708fc feat(lusha): update credential spec with new tool names 2026-03-09 16:45:33 -07:00
Timothy bb4436e277 feat(lusha): add bulk enrich, technologies, and decision makers tools
Add lusha_bulk_enrich_persons for batch enrichment,
lusha_get_technologies for company tech stack lookup, and
lusha_search_decision_makers for senior contact discovery.
2026-03-09 16:45:17 -07:00
Timothy 795f66c90b feat(gsc): update credential spec with new tool names 2026-03-09 16:44:33 -07:00
Timothy 9ef6d51573 feat(gsc): add top queries, top pages, and delete sitemap tools
Add gsc_top_queries and gsc_top_pages convenience wrappers for
click-sorted analytics, and gsc_delete_sitemap for sitemap removal.
2026-03-09 16:44:20 -07:00
Timothy 3fed4e3409 feat(aws-s3): update credential specs with new tool names 2026-03-09 16:43:37 -07:00
Timothy 670e69f2ce feat(aws-s3): add copy, metadata, and presigned URL tools
Add s3_copy_object for copying within/between buckets,
s3_get_object_metadata for HEAD-based metadata retrieval, and
s3_generate_presigned_url for temporary access URL generation.
2026-03-09 16:42:46 -07:00
Timothy f6c4747905 feat(pushover): update credential spec with new tool names 2026-03-09 16:42:04 -07:00
Timothy 7b78f6c12f feat(pushover): add cancel receipt, glance update, and limits tools
Add pushover_cancel_receipt for stopping emergency retries,
pushover_send_glance for widget data updates, and
pushover_get_limits for checking message usage.
2026-03-09 16:41:52 -07:00
Timothy 1c75100f59 feat(news): update credential spec with new tool names 2026-03-09 16:41:15 -07:00
Timothy b325e103c6 feat(news): add latest, by-source, and by-topic search tools
Add news_latest for breaking news without query, news_by_source
for source-filtered articles, and news_by_topic for topic-based
discovery with automatic date ranges.
2026-03-09 16:40:54 -07:00
Timothy aef2d2d474 feat(serpapi): update credential spec with new tool names 2026-03-09 16:40:05 -07:00
Timothy 95a2b6711e feat(serpapi): add cited-by, profile search, and Google web search tools
Add scholar_cited_by for finding papers citing a given paper,
scholar_search_profiles for author profile discovery, and
serpapi_google_search for structured Google web results.
2026-03-09 16:38:50 -07:00
Timothy 7fb5e8145c feat(exa-search): update credential spec with new tool names 2026-03-09 16:37:56 -07:00
Timothy 8e45d0df83 feat(exa-search): add news, papers, and company search tools
Add exa_search_news, exa_search_papers, and exa_search_companies
convenience wrappers with pre-configured category filters and
automatic date/domain filtering.
2026-03-09 16:37:44 -07:00
Richard Tang 8d4657c13e Merge branch 'feat/queen-planning-phase' into feature/queen-global-memory 2026-03-09 16:10:42 -07:00
Timothy 3d175a6d54 feat(greenhouse): update credential spec with new tool names
Add greenhouse_list_offers, greenhouse_add_candidate_note, greenhouse_list_scorecards.
2026-03-09 16:02:53 -07:00
Timothy b9debaf957 feat(greenhouse): add list offers, candidate notes, and scorecards tools
- greenhouse_list_offers: GET /offers or /applications/{id}/offers
- greenhouse_add_candidate_note: POST /candidates/{id}/activity_feed/notes
- greenhouse_list_scorecards: GET /applications/{id}/scorecards
- Add _post helper for POST requests
2026-03-09 16:02:08 -07:00
Richard Tang bdcbcff6f3 feat: better instruction for planning mode switch 2026-03-09 16:01:34 -07:00
Timothy d2d7bdc374 feat(brevo): update credential spec with new tool names
Add brevo_list_contacts, brevo_delete_contact, brevo_list_email_campaigns.
2026-03-09 16:01:16 -07:00
Timothy 40e494b15d feat(brevo): add list contacts, delete contact, and list campaigns tools
- brevo_list_contacts: GET /contacts with pagination and modified_since filter
- brevo_delete_contact: DELETE /contacts/{email} to remove contacts
- brevo_list_email_campaigns: GET /emailCampaigns with status filter and stats
2026-03-09 16:00:42 -07:00
Timothy b5e840c0cb feat(quickbooks): update credential specs with new tool names
Add quickbooks_list_invoices, quickbooks_get_customer, quickbooks_create_payment
to both credential specs (token and realm_id).
2026-03-09 15:59:46 -07:00
Timothy f3d74c9ae4 feat(quickbooks): add list invoices, get customer, and create payment tools
- quickbooks_list_invoices: query invoices with status/customer filters
- quickbooks_get_customer: GET /customer/{id} with address and contact info
- quickbooks_create_payment: POST /payment with optional invoice linking
2026-03-09 15:59:23 -07:00
Richard Tang a22b321692 feat: improve phase switching tools 2026-03-09 15:33:03 -07:00
Timothy 2e7dbad118 feat(cloudinary): update credential specs with new tool names
Add cloudinary_get_usage, cloudinary_rename_resource, cloudinary_add_tag
to all three credential specs (cloud_name, key, secret).
2026-03-09 15:31:42 -07:00
Timothy 6183d1b65b feat(cloudinary): add usage, rename, and add tag tools
- cloudinary_get_usage: GET /usage for storage, bandwidth, transformation limits
- cloudinary_rename_resource: POST /rename to change public_id
- cloudinary_add_tag: POST /tags to add tags to resources
2026-03-09 15:31:22 -07:00
Timothy 09931e6d98 feat(twitter): update credential spec with new tool names
Add twitter_get_user_followers, twitter_get_tweet_replies, twitter_get_list_tweets.
2026-03-09 15:25:21 -07:00
Timothy cb394127d1 feat(twitter): add user followers, tweet replies, and list tweets tools
- twitter_get_user_followers: GET /users/{id}/followers with profile details
- twitter_get_tweet_replies: search recent replies via conversation_id
- twitter_get_list_tweets: GET /lists/{id}/tweets with author expansion
2026-03-09 15:21:47 -07:00
Timothy 588fa1f9ea feat(google-analytics): update credential spec with new tool names
Add ga_get_user_demographics, ga_get_conversion_events, ga_get_landing_pages.
2026-03-09 15:21:09 -07:00
Timothy 73325c280c feat(google-analytics): add demographics, conversion events, and landing pages tools
- ga_get_user_demographics: country/language/device breakdown
- ga_get_conversion_events: event counts, conversions, and revenue
- ga_get_landing_pages: top landing pages with bounce rate and session duration
2026-03-09 15:20:51 -07:00
Timothy 8c5ae8ffa8 feat(docker-hub): update credential spec with new tool names
Add docker_hub_get_tag_detail, docker_hub_delete_tag, docker_hub_list_webhooks.
2026-03-09 15:19:58 -07:00
Timothy 7389423c70 feat(docker-hub): add tag detail, delete tag, and list webhooks tools
- docker_hub_get_tag_detail: GET /repositories/{repo}/tags/{tag} with image architectures
- docker_hub_delete_tag: DELETE /repositories/{repo}/tags/{tag}
- docker_hub_list_webhooks: GET /repositories/{repo}/webhooks
- Add _delete helper for DELETE requests
2026-03-09 15:18:46 -07:00
Timothy 20c15446a7 feat(apollo): update credential spec with new tool names
Add apollo_get_person_activities, apollo_list_email_accounts,
apollo_bulk_enrich_people.
2026-03-09 15:17:38 -07:00
Richard Tang c05c30dd9a feat: add meta agent tools to planning 2026-03-09 15:14:34 -07:00
Timothy bcd2fb76bd feat(apollo): add person activities, email accounts, and bulk enrich tools
- apollo_get_person_activities: GET /activities for contact activity history
- apollo_list_email_accounts: GET /email_accounts for connected sending accounts
- apollo_bulk_enrich_people: POST /people/bulk_match for batch enrichment (up to 10)
2026-03-09 15:03:21 -07:00
Timothy 5fb97ab6df feat(calendly): update credential spec with new tool names
Add calendly_cancel_event, calendly_list_webhooks, calendly_get_event_type.
2026-03-09 15:00:46 -07:00
Timothy 0224ebc800 feat(calendly): add cancel event, list webhooks, and get event type tools
- calendly_cancel_event: POST /scheduled_events/{id}/cancellation
- calendly_list_webhooks: GET /webhook_subscriptions for org/user scope
- calendly_get_event_type: GET /event_types/{id} for meeting template details
- Add _post helper for POST requests
2026-03-09 15:00:34 -07:00
Timothy af88f7299a feat(pagerduty): update credential specs with new tool names
Add pagerduty_list_oncalls, pagerduty_add_incident_note,
pagerduty_list_escalation_policies to api_key spec.
Add pagerduty_add_incident_note to from_email spec (write operation).
2026-03-09 14:59:53 -07:00
Timothy 81729706ae feat(pagerduty): add oncalls, incident notes, and escalation policies tools
- pagerduty_list_oncalls: GET /oncalls with schedule/policy filters
- pagerduty_add_incident_note: POST /incidents/{id}/notes to add notes
- pagerduty_list_escalation_policies: GET /escalation_policies with search
2026-03-09 14:59:33 -07:00
Timothy bbb1b43ebe feat(airtable): update credential spec with new tool names
Add airtable_delete_records, airtable_search_records, airtable_list_collaborators.
2026-03-09 14:58:57 -07:00
Timothy 70ed5fa8df feat(airtable): add delete records, search records, and list collaborators tools
- airtable_delete_records: DELETE records by comma-separated IDs (up to 10)
- airtable_search_records: search records using FIND formula for partial matching
- airtable_list_collaborators: list base collaborators via meta API
- Add _delete helper for DELETE requests
2026-03-09 14:58:42 -07:00
Timothy 312db6620d feat(reddit): update credential specs with new tool names
Add reddit_get_subreddit_info, reddit_get_post_detail, reddit_get_user_posts
to both credential specs (client_id and client_secret).
2026-03-09 14:57:50 -07:00
Timothy 93c1fc5488 feat(reddit): add subreddit info, post detail, and user posts tools
- reddit_get_subreddit_info: GET /r/{name}/about for subscriber count, description
- reddit_get_post_detail: GET /by_id/t3_{id} for full post details with flair, ratios
- reddit_get_user_posts: GET /user/{name}/submitted for user's post history
2026-03-09 14:57:33 -07:00
Richard Tang 90762f275b feat: give planning mode the load tool 2026-03-09 14:55:53 -07:00
Timothy 801443027d feat(pipedrive): update credential spec with new tool names
Add pipedrive_update_deal, pipedrive_create_person, pipedrive_create_activity
to the credential spec tools list.
2026-03-09 14:54:22 -07:00
Timothy ca2ead76cd feat(pipedrive): add deal update, person creation, and activity creation tools
Add pipedrive_update_deal, pipedrive_create_person, and
pipedrive_create_activity tools using Pipedrive REST API v1.
2026-03-09 14:52:27 -07:00
Timothy d562144a6d feat(confluence): register new tools in credential specs
Add confluence_update_page, confluence_delete_page, and
confluence_get_page_children to all three Confluence credential specs.
2026-03-09 14:51:39 -07:00
Timothy af7fb7da27 feat(confluence): add page update, delete, and children listing tools
Add confluence_update_page, confluence_delete_page, and
confluence_get_page_children tools using Confluence REST API v2.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 14:51:26 -07:00
Timothy c17dd63b4a feat(intercom): register new tools in credential spec
Add intercom_close_conversation, intercom_create_contact, and
intercom_list_conversations to Intercom credential spec.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 14:50:49 -07:00
Timothy 866db289e2 feat(intercom): add close conversation, create contact, and list conversations tools
Add close_conversation, create_contact, and list_conversations client
methods plus intercom_close_conversation, intercom_create_contact, and
intercom_list_conversations MCP tools using Intercom API v2.11.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 14:50:30 -07:00
Timothy b4ac5e9607 feat(gitlab): register new tools in credential spec
Add gitlab_update_issue, gitlab_get_merge_request, and
gitlab_create_merge_request_note to GitLab credential spec.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 14:49:01 -07:00
Timothy 3ca7af4242 feat(gitlab): add issue update, MR detail, and MR comment tools
Add _put helper and gitlab_update_issue, gitlab_get_merge_request,
and gitlab_create_merge_request_note tools using GitLab REST API v4.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 14:48:40 -07:00
Richard Tang 2b12a9c91a Merge remote-tracking branch 'origin/feature/queen-global-memory' into feature/queen-global-memory 2026-03-09 14:47:27 -07:00
Richard Tang 9a94595a42 feat: extract the shared knowledge between planning and building 2026-03-09 14:45:31 -07:00
Richard Tang e1540dfaa6 refactor: drop hive code CLI 2026-03-09 14:30:13 -07:00
Richard Tang 4f5ac6d1b1 refactor: rename hive_coder to queen and extract queen orchestrator 2026-03-09 14:23:31 -07:00
Richard Tang c87d7b13da refactor: rename hive_coder to queen and extract queen orchestrator 2026-03-09 14:23:16 -07:00
Timothy c4acf0b659 fix: memory consolidation hook, simplify generated memory files 2026-03-09 14:15:01 -07:00
RichardTang-Aden 5e1ab3ca37 Merge pull request #5029 from karthik-kotra/docs/setup-troubleshooting
docs(setup): add troubleshooting steps for common WSL setup issues
2026-03-09 14:06:28 -07:00
Timothy 79c32c9f47 feat(slack): register new tools in credential spec
Add slack_get_channel_info, slack_list_files, and slack_get_file_info
to Slack credential spec.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:58:14 -07:00
Timothy 35ee29a843 feat(slack): add channel info, file listing, and file detail tools
Add get_channel_info, list_files, and get_file_info client methods
plus slack_get_channel_info, slack_list_files, and slack_get_file_info
MCP tools using Slack Web API.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:57:45 -07:00
Timothy 573aea1d9c feat(stripe): register new tools in credential spec
Add stripe_list_disputes, stripe_list_events, and
stripe_create_checkout_session to Stripe credential spec.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:56:20 -07:00
Timothy 6ecbc30293 feat(stripe): add disputes, events, and checkout session tools
Add list_disputes, list_events, and create_checkout_session client
methods plus stripe_list_disputes, stripe_list_events, and
stripe_create_checkout_session MCP tools using Stripe API.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:56:07 -07:00
Timothy 843b1f2e1d feat(linear): register new tools in credential spec
Add linear_cycles_list, linear_issue_comments_list, and
linear_issue_relation_create to Linear credential spec.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:54:48 -07:00
Timothy 89f6c8e4ef feat(linear): add cycle listing, issue comments, and issue relations tools
Add list_cycles, list_issue_comments, and create_issue_relation client
methods plus linear_cycles_list, linear_issue_comments_list, and
linear_issue_relation_create MCP tools using Linear GraphQL API.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:52:09 -07:00
Timothy 304ac07bd8 feat(zoom): register new tools in credential spec
Add zoom_update_meeting, zoom_list_meeting_participants, and
zoom_list_meeting_registrants to Zoom credential spec.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:50:27 -07:00
Timothy 82f0684b83 feat(zoom): add meeting update, participants, and registrants tools
Add zoom_update_meeting (PATCH), zoom_list_meeting_participants
(past meeting attendees), and zoom_list_meeting_registrants
using Zoom REST API v2.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:45:11 -07:00
Timothy 963c37dc31 feat(twilio): register new tools in credential specs
Add twilio_list_phone_numbers, twilio_list_calls, and
twilio_delete_message to both Twilio credential specs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:41:26 -07:00
Timothy c02da3ba5a feat(twilio): add phone number listing, call history, and message deletion tools
Add twilio_list_phone_numbers, twilio_list_calls, and
twilio_delete_message tools using Twilio REST API.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:40:58 -07:00
Timothy 7f34e95ec6 feat(shopify): register new tools in credential specs
Add shopify_update_product, shopify_get_customer, and
shopify_create_draft_order to both Shopify credential specs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:40:28 -07:00
Timothy f2998fe098 feat(shopify): add product update, customer detail, and draft order tools
Add shopify_update_product, shopify_get_customer, and
shopify_create_draft_order tools using Shopify Admin REST API.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:40:15 -07:00
Timothy 323a2489b8 feat(zendesk): register new tools in credential specs
Add zendesk_get_ticket_comments, zendesk_add_ticket_comment, and
zendesk_list_users to all three Zendesk credential specs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:39:35 -07:00
Timothy f6d1cd640e feat(zendesk): add ticket comments and user listing tools
Add zendesk_get_ticket_comments, zendesk_add_ticket_comment, and
zendesk_list_users tools using Zendesk Support API v2.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:39:25 -07:00
Timothy ddf89a04fe feat(asana): update credential spec for new tools
Register asana_update_task, asana_add_comment, and
asana_create_subtask in the Asana credential spec.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:35:16 -07:00
Timothy c5dc89f5ee feat(asana): add update_task, add_comment, create_subtask tools
Add _put helper and three new Asana MCP tools:
- asana_update_task: modify name, notes, completion, due date, assignee
- asana_add_comment: post comment stories on tasks
- asana_create_subtask: create subtasks under existing tasks

API ref: https://developers.asana.com/docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:35:05 -07:00
Timothy 6ade34b759 feat(trello): register get_card, create_list, search_cards tools
Add three new Trello MCP tools:
- trello_get_card: retrieve full card details with members/checklists/attachments
- trello_create_list: create new lists on boards
- trello_search_cards: full-text search across cards with board scoping

Update credential spec to include the new tool names.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:20:43 -07:00
Timothy 09d5f0a9df feat(trello): add client methods for get_card, create_list, search
Add TrelloClient methods for:
- get_card: GET /1/cards/{id} with members, checklists, attachments
- create_list: POST /1/lists to create new board lists
- search: GET /1/search for full-text search across cards

API ref: https://developer.atlassian.com/cloud/trello/rest/api-group-cards/

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:19:59 -07:00
Timothy a60d63cca2 feat(github): register list_commits, create_release, list_workflow_runs
Add three new GitHub MCP tools:
- github_list_commits: query commits with author/date/branch filters
- github_create_release: create tagged releases with notes and draft support
- github_list_workflow_runs: monitor CI/CD pipeline runs with status filters

Update credential spec to include the new tool names.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:19:16 -07:00
Timothy 8616975fc5 feat(github): add client methods for commits, releases, workflow runs
Add _GitHubClient methods for:
- list_commits: GET /repos/{owner}/{repo}/commits with sha/author/date filters
- create_release: POST /repos/{owner}/{repo}/releases with tag, notes, draft
- list_workflow_runs: GET /repos/{owner}/{repo}/actions/runs with filters

API ref: https://docs.github.com/en/rest

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:18:33 -07:00
Timothy e5ae919d8f feat(telegram): register get_chat_member_count, send_video, set_description
Add three new Telegram MCP tools:
- telegram_get_chat_member_count: retrieve group/channel membership size
- telegram_send_video: send video files via URL or file_id
- telegram_set_chat_description: update group/channel descriptions

Update credential spec to include the new tool names.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:17:30 -07:00
Timothy 8e7f5eaaba feat(telegram): add client methods for member count, video, description
Add _TelegramClient methods for:
- get_chat_member_count: getChatMemberCount API endpoint
- send_video: sendVideo with caption, parse_mode, duration support
- set_chat_description: setChatDescription for groups/channels

API ref: https://core.telegram.org/bots/api

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:13:06 -07:00
Timothy 4d1ff8b054 feat(salesforce): update credential spec for new tools
Register salesforce_delete_record, salesforce_search_records, and
salesforce_get_record_count in both Salesforce credential specs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:12:25 -07:00
Timothy 9fa81e8599 feat(salesforce): add delete_record, search_records, get_record_count
Add three new Salesforce MCP tools:
- salesforce_delete_record: DELETE /sobjects/{type}/{id}
- salesforce_search_records: SOSL full-text search via /search/
- salesforce_get_record_count: efficient COUNT() query for any SObject

API ref: https://developer.salesforce.com/docs/atlas.en-us.api_rest.meta

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:12:11 -07:00
Timothy cf8e19b059 feat(discord): register get_channel, create_reaction, delete_message tools
Add three new Discord MCP tools:
- discord_get_channel: retrieve channel metadata (name, topic, type)
- discord_create_reaction: add emoji reactions to messages
- discord_delete_message: remove messages from channels

Update credential spec to include the new tool names.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:11:25 -07:00
Timothy dfa3f60fcf feat(discord): add client methods for get_channel, reactions, delete
Add _DiscordClient methods for:
- get_channel: retrieve channel metadata via GET /channels/{id}
- create_reaction: add emoji reaction via PUT reactions endpoint
- delete_message: remove a message via DELETE messages endpoint

API ref: https://discord.com/developers/docs/resources

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:10:49 -07:00
Timothy b795f1b253 feat(notion): update credential spec for new tools
Register notion_update_page, notion_archive_page, and
notion_append_blocks in the Notion credential spec.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:06:20 -07:00
Timothy 73423c0dd2 feat(notion): add update_page, archive_page, append_blocks tools
Add three new Notion MCP tools:
- notion_update_page: modify page properties via PATCH /pages/{id}
- notion_archive_page: archive or restore pages
- notion_append_blocks: add paragraphs, headings, lists, todos, etc.

API ref: https://developers.notion.com/reference

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:06:08 -07:00
Timothy 3d844e1539 feat(jira): update credential spec for new tools
Register jira_update_issue, jira_list_transitions, and
jira_transition_issue in all three Jira credential specs
(domain, email, token).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:04:32 -07:00
Timothy b619119eb5 feat(jira): add update_issue, list_transitions, transition_issue tools
Add three new Jira MCP tools:
- jira_update_issue: modify summary, description, priority, labels, assignee
- jira_list_transitions: discover available status transitions for an issue
- jira_transition_issue: move an issue to a new status with optional comment

API ref: https://developer.atlassian.com/cloud/jira/platform/rest/v3/

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:04:19 -07:00
Timothy b00ed4fc70 feat(hubspot): register delete_object, list/create_associations tools
Add three new MCP tools:
- hubspot_delete_object: archive contacts, companies, or deals
- hubspot_list_associations: query links between CRM objects (v4 API)
- hubspot_create_association: link two CRM records together

Update credential spec to include the new tool names.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:03:37 -07:00
Timothy 5ec5fbe998 feat(hubspot): add client methods for delete, associations
Add _HubSpotClient methods for:
- delete_object: archive a CRM object via DELETE /crm/v3/objects
- list_associations: query associations via GET /crm/v4/objects associations endpoint
- create_association: link two CRM objects via PUT /crm/v4/objects associations endpoint

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 13:02:49 -07:00
Richard Tang 2ed814455a Merge branch 'feat/queen-planning-phase' into feature/queen-global-memory 2026-03-09 12:57:23 -07:00
Timothy ad1a4ef0c3 fix: cancellation button 2026-03-09 12:48:20 -07:00
Timothy 2111c808a9 feat: queen memory v1 2026-03-09 11:55:39 -07:00
Bryan @ Aden 402bb38267 Merge pull request #6079 from Waryjustice/fix/google-sheets-credentials-orphan
fix(credentials): remove orphaned google_sheets.py credential spec
2026-03-09 18:37:27 +00:00
Waryjustice 0a55928872 fix(credentials): remove orphaned google_sheets.py credential spec
The google_sheets.py file defined GOOGLE_SHEETS_CREDENTIALS (an API-key
based credential for reading public sheets via GOOGLE_SHEETS_API_KEY) but
was never wired into the package:

- Never imported in credentials/__init__.py
- Never merged into CREDENTIAL_SPECS
- Never listed in __all__
- Tool never calls credentials.get('google_sheets_key') — uses 'google' (OAuth2)
- Tool names in the spec were stale and did not match actual function names

The 'google' credential in email.py already correctly covers all Google
Sheets tools via OAuth2. This file was dead code with no referencing
imports anywhere in the repository.

Closes #6077

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-03-09 23:44:26 +05:30
Richard Tang cdf76ae3b9 fix: eventloop test 2026-03-09 10:23:56 -07:00
bryan 4ad0d0e077 fix: align the credential functions to be the same 2026-03-09 10:14:21 -07:00
Richard Tang 42d0592941 refactor: judge evaluate 2026-03-09 10:09:15 -07:00
Richard Tang 1de7cf821d fix: handle judge with empty message 2026-03-09 09:58:29 -07:00
Timothy 4ea8540e25 fix: better logging for memory consolidation event 2026-03-08 20:44:40 -07:00
Timothy bfa3b8e0f6 fix: queen memory health 2026-03-08 20:28:53 -07:00
Richard Tang 55eccfd75f feat: intake node prompt in planning mode 2026-03-08 20:27:24 -07:00
Timothy 1e994a77b5 feat: queen agent global memory 2026-03-08 19:54:46 -07:00
Richard Tang d12afeb35d chore: ruff lint 2026-03-08 19:46:49 -07:00
Timothy @aden b55a77634b Delete .github/ISSUE_TEMPLATE/link-discord.yml 2026-03-08 19:44:48 -07:00
Richard Tang e84fefd319 feat: separate the queen and worker tools in prompts 2026-03-08 19:40:30 -07:00
bryan cba0ec110f fix: linter update 2026-03-08 19:37:57 -07:00
bryan 0256e0c944 Merge branch 'main' into feat/agent-trigger 2026-03-08 19:28:36 -07:00
Bryan @ Aden f7db603922 Merge pull request #6048 from aden-hive/fix/draft-email-tool
(micro-fix): draft email tool
2026-03-09 02:26:58 +00:00
bryan b4a47a12ff fix: linter formatting 2026-03-08 19:26:06 -07:00
bryan 2228851b16 feat: added reply in thread to draft email tool 2026-03-08 19:24:38 -07:00
Richard Tang d2b510014d feat: adjust tools and knowledge separation between planning and building 2026-03-08 19:21:50 -07:00
Bryan @ Aden ed0a211906 Merge pull request #6047 from aden-hive/fix/reply-email-tool
(micro-fix): reply email tool
2026-03-09 02:00:03 +00:00
bryan 63744ddaef fix: update to pass linter 2026-03-08 18:58:50 -07:00
bryan 82331acb77 feat: update reply email tool to contain the email thread in the body 2026-03-08 18:53:53 -07:00
Richard Tang 3ed5fda448 feat: planning phase for the queen 2026-03-08 18:49:45 -07:00
bryan 4d9d0362a0 fixes to make the timer trigger properly 2026-03-08 18:44:42 -07:00
Timothy @aden b96bbcaa72 Merge pull request #6044 from Amdev-5/fix/e501-coder-tools-server-6043
fix: E501 line too long in coder_tools_server.py
2026-03-08 17:39:59 -07:00
Timothy edfa49bf7a fix: ci test 2026-03-08 17:29:36 -07:00
RichardTang-Aden eb9e4ed23c Merge pull request #5955 from akshajtiwari/ci-first-issue
CI: add uv caching, improve PR requirements workflow
2026-03-08 17:17:22 -07:00
Amdev-5 fed9e90271 fix: E501 line too long in coder_tools_server.py
Break ternary expression across multiple lines to satisfy
the 100-char line length limit.

Fixes #6043
2026-03-09 05:35:45 +05:30
bryan f474d0bc8e Merge branch 'main' into feat/agent-trigger 2026-03-08 16:59:14 -07:00
bryan 6a0681b9aa feat: fixing phase 4, continuing to test 2026-03-08 16:52:00 -07:00
Timothy ca565ae664 fix: validate agent package for orphaned nodes 2026-03-07 09:29:48 -08:00
Timothy 42ce97e0fc fix: agent package validation - no orphaned nodes 2026-03-07 08:47:01 -08:00
Akshaj Tiwari bea17b5f79 simplify label creation logic by assuming label pre-exists 2026-03-07 19:02:04 +05:30
Akshaj Tiwari ab0d5ce8d3 change pr.updated_at to pr.created_at for the grace period check 2026-03-07 18:58:36 +05:30
Akshaj Tiwari b374d5119a resolving the ci.yml issues by using enable-cache instead of manual caching 2026-03-07 18:49:17 +05:30
Robert Hallers 7a467ef9b8 docs: mark TUI as deprecated in roadmap to match CLAUDE.md
Resolves inconsistency between CLAUDE.md/AGENTS.md (TUI deprecated) and
docs/roadmap.md (TUI listed as completed feature).

- Strike through TUI items in 3 roadmap sections
- Add deprecation note to TUI-to-GUI upgrade section
- Reference AGENTS.md and hive open as replacement

Fixes #5941

Signed-off-by: Robert Hallers <robert@terplabs.ai>
2026-03-07 02:36:04 -05:00
RichardTang-Aden 9129b4a42e Merge pull request #5975 from aden-hive/feat/queen-responsibility
Release / Create Release (push) Waiting to run
feat: separate queen responsibility by phases
2026-03-06 19:21:53 -08:00
bryan c7e634851b feat: phase 4 of trigger plan 2026-03-06 19:21:32 -08:00
Richard Tang e906646d49 Merge remote-tracking branch 'origin/feature/thinking-hook' into feat/queen-responsibility 2026-03-06 19:15:29 -08:00
Timothy 086a532521 fix: skip queen judge, turn off aggressive compaction 2026-03-06 19:14:25 -08:00
Richard Tang 19dd40ed3a chore: ruff lint 2026-03-06 19:11:53 -08:00
Richard Tang 196f3d645f feat: building phase prompts improvements 2026-03-06 18:59:12 -08:00
Richard Tang 80fd91d175 feat: building phase prompt optimization 2026-03-06 18:53:33 -08:00
Richard Tang 695410f880 Merge remote-tracking branch 'origin/feature/thinking-hook' into feat/queen-responsibility 2026-03-06 18:39:21 -08:00
Timothy 009c62dac6 chore: re-organize hooks 2026-03-06 18:38:20 -08:00
Richard Tang 27c8904341 feat: limit the tool description in simple mode 2026-03-06 18:37:08 -08:00
Richard Tang ddfce58071 feat: simplify building prompts 2026-03-06 18:29:44 -08:00
bryan cdb7155960 feat: phase 3 of trigger plan 2026-03-06 18:07:26 -08:00
Richard Tang 1bb850bdbe Merge remote-tracking branch 'origin/feature/thinking-hook' into feat/queen-responsibility 2026-03-06 17:59:33 -08:00
Richard Tang 5019633ba3 fix: remove wrong tool examples 2026-03-06 17:58:50 -08:00
Timothy @aden b0fd8b83f0 Merge pull request #5896 from VasuBansal7576/codex/pr-minimax-single
fix: add minimax provider mapping and stream fallback
2026-03-06 17:54:41 -08:00
Richard Tang 2dc58eeeb0 fix: add missing gcu prompts 2026-03-06 17:48:37 -08:00
Richard Tang 50ab55ded5 feat: loading improvement 2026-03-06 17:35:17 -08:00
bryan 3f7790c26a feat: phase 2 of trigger plan 2026-03-06 17:22:57 -08:00
Timothy 9f656577a2 fix: turn signal edge case 2026-03-06 17:18:54 -08:00
Richard Tang 5c87b4b194 Merge remote-tracking branch 'origin/feature/thinking-hook' into feat/queen-responsibility 2026-03-06 17:10:25 -08:00
Timothy 7f866b24a1 Merge branch 'feat/queen-responsibility' into feature/thinking-hook 2026-03-06 17:07:12 -08:00
Timothy 5eb623e931 fix: back to back compaction edge case 2026-03-06 17:06:53 -08:00
Richard Tang 5583896429 fix: add back gcu instruction 2026-03-06 17:05:47 -08:00
Timothy 3a8321b975 chore: fix compaction reference 2026-03-06 16:59:44 -08:00
bryan 5676b115f4 Merge branch 'feat/queen-responsibility' into feat/agent-trigger 2026-03-06 16:58:06 -08:00
Richard Tang 8443ec87a6 fix: output key that terminated the queen 2026-03-06 16:55:10 -08:00
Richard Tang 1e06e87f4c feat: improve validation 2026-03-06 16:34:28 -08:00
Richard Tang e2558e3f95 fix: llm friendly input 2026-03-06 16:04:58 -08:00
Richard Tang 1344d3bb8e feat: add node parameters and fix problems for initialize_agent_package 2026-03-06 15:54:27 -08:00
Timothy dc7ec6c058 Merge branch 'feat/queen-responsibility' into feature/thinking-hook 2026-03-06 15:35:17 -08:00
Richard Tang 0ba781609a refactor: remove unused builder functions 2026-03-06 15:32:45 -08:00
bryan 61c59d57e8 feat: phase 1 of trigger plan 2026-03-06 15:11:36 -08:00
Richard Tang 5ce230b0a6 refactor: move the coder tools 2026-03-06 14:56:19 -08:00
Timothy ff1527a77a fix: thinking hook max token 2026-03-06 14:53:32 -08:00
Timothy 252dea0bc3 Merge branch 'feat/queen-responsibility' into feature/thinking-hook 2026-03-06 14:37:02 -08:00
Timothy 126cbe529f feat: queen thinking hook 2026-03-06 14:30:10 -08:00
Richard Tang 207a0a0ca5 feat: fix for mcp tools and templates 2026-03-06 14:27:20 -08:00
Richard Tang d9f502173b feat: queen building improvements 2026-03-06 13:51:49 -08:00
Richard Tang f090ce4d5a fix: duplicated session calls 2026-03-06 12:28:37 -08:00
Richard Tang 1f7efcd940 feat: queen prompt optimization 2026-03-06 12:27:08 -08:00
Akshaj Tiwari fbbbaadd1e remove workflow_dispatch trigger from PR requirements workflows(forgot this commit) 2026-03-07 00:59:56 +05:30
Richard Tang 4de140a170 Merge remote-tracking branch 'origin/main' into feat/queen-responsibility 2026-03-06 11:17:03 -08:00
Richard Tang ed5cfb93a4 fix: catch Cannot write to closing transport error 2026-03-06 11:15:13 -08:00
Richard Tang 891bf08477 feat: re-organized queen prompt 2026-03-06 11:13:15 -08:00
Akshaj Tiwari 37651e534f add PR requirements warning and enforcement workflow and remove the workflow dispatch trigger 2026-03-07 00:39:35 +05:30
Akshaj Tiwari df63c3e781 add the pr requirement changes and remove the workflow dispatch option from ci.yml(tested) 2026-03-06 23:44:45 +05:30
Akshaj Tiwari 838da4a16e style: fix ruff import ordering 2026-03-06 22:57:14 +05:30
Akshaj Tiwari e916d573f6 adding workflow dispatch for testing 2026-03-06 22:51:19 +05:30
Richard Tang 08d51bb377 refactor: remove the old coderagent for TUI 2026-03-06 09:20:49 -08:00
Akshaj Tiwari fa5ebf19a4 first commit with the cache and working directory attributes 2026-03-06 22:49:06 +05:30
Richard Tang 17de0efcaf feat: rename escalation tool 2026-03-06 08:48:16 -08:00
Timothy 4099603a91 chore: lint 2026-03-06 08:21:40 -08:00
Vasu Bansal 988a58c1b7 fix: harden legacy agent.json loading error handling 2026-03-06 20:31:59 +05:30
Vasu Bansal cbc7ec3a32 fix: resolve aden client import duplication after rebase 2026-03-06 16:13:49 +05:30
Vasu Bansal 07d4bf8044 fix: resolve ruff import-order failure in aden client 2026-03-06 16:10:02 +05:30
Vasu Bansal e302e93ac9 chore: retrigger ci 2026-03-06 16:10:02 +05:30
Vasu Bansal 80f5a363d2 fix: address minimax review feedback in quickstart and provider wiring 2026-03-06 16:10:02 +05:30
Vasu Bansal 7b5b6d2c51 fix: add minimax provider mapping and stream fallback 2026-03-06 16:10:02 +05:30
Timothy 0b1fd72e49 chore: lint 2026-03-05 21:28:17 -08:00
Timothy 353f5c31a2 chore: lint 2026-03-05 21:24:21 -08:00
Richard Tang ad40b049ae feat: update the escalate tool 2026-03-05 20:53:33 -08:00
Richard Tang 42c9a11b1a feat: remove the terminal node for queen 2026-03-05 20:42:28 -08:00
Richard Tang c3fb1885c3 feat: make terminal node validation warning 2026-03-05 20:20:49 -08:00
Richard Tang bb413bad1f feat: prompts to allow user to override 2026-03-05 19:50:11 -08:00
Timothy afef3cb66a Merge branch 'fix/output-cleaner' into feat/queen-responsibility 2026-03-05 19:48:28 -08:00
Timothy b1f3d931cd fix: remove output cleaner 2026-03-05 19:48:13 -08:00
Richard Tang 297b24e061 feat: instruction for running phase to handle escalation 2026-03-05 19:47:13 -08:00
Richard Tang 775a0fa511 chore: prompt debug tool 2026-03-05 19:35:52 -08:00
Richard Tang 77abed89b9 feat: queen identity 2026-03-05 19:29:04 -08:00
Richard Tang e2aeb72d49 feat: queen identity 2026-03-05 19:25:59 -08:00
Richard Tang 2b440f84f0 feat: add the termination session back to queen 2026-03-05 19:10:49 -08:00
Richard Tang 8fd66a12c5 Merge remote-tracking branch 'origin/feat/queen-responsibility' into feat/queen-responsibility 2026-03-05 19:03:27 -08:00
Richard Tang f23d5a3ff5 feat: add terminal node back in graph 2026-03-05 19:02:41 -08:00
Timothy be8ec867e5 fix: better stall detection 2026-03-05 18:51:36 -08:00
Timothy b2ba42e541 Merge branch 'feat/queen-responsibility' into feat/worker-progressive-disclosure 2026-03-05 15:26:09 -08:00
Richard Tang 94d0038e03 chore: remove duplicates in anti-patterns 2026-03-05 14:54:46 -08:00
Timothy e1bf300e3c feat: progressive disclosure of runtime data 2026-03-05 14:44:02 -08:00
Timothy @aden f36add83f0 Merge pull request #5901 from aden-hive/fix/bom-safe-json-load
fix(micro-fix): bom safe json loading
2026-03-05 14:29:37 -08:00
Timothy Zhang a57d58e8d4 fix: bom safe json loading 2026-03-05 14:27:15 -08:00
Richard Tang c6b922e831 feat: condense framework guide 2026-03-05 14:26:03 -08:00
Richard Tang 71d12a7904 feat: condensed queen building prompts 2026-03-05 14:18:03 -08:00
Richard Tang 24c25d408c feat: remove unused prompts 2026-03-05 14:13:02 -08:00
Richard Tang 2e99fc9fe5 feat: change graph guidelines 2026-03-05 14:08:52 -08:00
Richard Tang c1f066b8ba feat: add gcu and validation in initialize_agent_package 2026-03-05 14:01:43 -08:00
Richard Tang e7a6074800 fix: prevent duplicate session creation when starting from home 2026-03-05 13:44:39 -08:00
Richard Tang 719942d29a fix: bug for multiple session calls 2026-03-05 13:04:17 -08:00
Richard Tang 190450a2b2 refactor: skip judge logic improvement 2026-03-05 12:38:18 -08:00
Richard Tang 44d609b719 feat: allow judge to wait queen input 2026-03-05 12:33:27 -08:00
Richard Tang 8c9892f9f6 feat: re-organized the tools for list mcp tools 2026-03-05 12:06:57 -08:00
Bryan @ Aden 85c204a442 Merge pull request #5403 from jackthepunished/feat/telegram-tool-expansion
feat(tools): expand Telegram tool with message management, media, and chat info operations
2026-03-05 19:48:05 +00:00
nikhilvarmakandula 151fbd7b00 feat(tools): add Open-Meteo weather tool with no API key required 2026-03-06 00:46:18 +05:30
Timothy @aden 56075a25a3 Merge pull request #5884 from aden-hive/feature/hive-as-a-game
feat(micro-fix): link discord github template
2026-03-05 10:55:13 -08:00
Timothy 2b0a6779cc feat: link discord github template 2026-03-05 10:54:30 -08:00
Timothy @aden b9ddce9d41 Merge pull request #5881 from aden-hive/docs-contributor-registration---Timothy
docs: Add TimothyZhang7 to contributors list
2026-03-05 10:37:26 -08:00
Timothy @aden 0c85406bc2 Add TimothyZhang7 to contributors list 2026-03-05 10:36:19 -08:00
Timothy @aden 1051134594 Merge pull request #5878 from aden-hive/feature/hive-as-a-game
chore: fix repo owner
2026-03-05 10:32:20 -08:00
jackthepunished 653d24df9d fix: address review — use POST for getChat, return raw API responses
- Change get_chat client method from httpx.get+params to httpx.post+json
  to avoid URL-encoding issues with @username chat IDs
- Remove {"success": True} normalization from delete_message,
  send_chat_action, pin_message, and unpin_message MCP tools;
  return raw Telegram API response consistently
- Update corresponding test mocks and assertions to match
2026-03-05 20:40:46 +03:00
jackthepunished b687fa9e94 feat(tools): expand Telegram tool with message management, media, and chat info operations
Add 8 new operations to the Telegram Bot tool, bringing it from 2 to 10
operations. This covers message lifecycle (edit, delete, forward), media
(send photo), chat info (get chat), UX (typing indicators), and pin
management — making the tool practical for agent workflows beyond
fire-and-forget messaging.

New operations:
- telegram_edit_message: edit previously sent messages
- telegram_delete_message: delete messages
- telegram_forward_message: forward between chats
- telegram_send_photo: send photos via URL or file_id
- telegram_send_chat_action: show typing/uploading indicators
- telegram_get_chat: retrieve chat metadata
- telegram_pin_message: pin important messages
- telegram_unpin_message: unpin stale messages

Also includes input validation for chat actions, credential spec updates,
central registry wiring, and 31 new tests (52 total).

Closes #4808
2026-03-05 20:40:45 +03:00
Timothy c7f0ab0444 chore: fix repo owner 2026-03-05 09:33:02 -08:00
Timothy @aden 93bf373a5b Merge pull request #5869 from aden-hive/feature/hive-as-a-game
feat: integration bounty program with Lurkr XP, Discord roles, and automated tracking
2026-03-05 09:29:48 -08:00
Timothy 2d87042a70 fix: bad chars 2026-03-05 09:00:51 -08:00
Timothy 8a28abb7b8 fix: github actions 2026-03-05 09:00:06 -08:00
Emmanuel Nwanguma 0cdfbac5a1 docs(tools): add README for brevo, csv, runtime_logs, account_info tools (#5602)
* docs(tools): add README for brevo, csv, runtime_logs, account_info tools

- brevo_tool: Transactional email/SMS and contact management via Brevo API
- csv_tool: Read, write, query CSV files with DuckDB SQL support
- runtime_logs_tool: Query three-level runtime logging system
- account_info_tool: Query connected accounts and identities

* docs: fix runtime_logs_tool README to match implementation

- query_runtime_logs: add missing status values (degraded, in_progress, needs_attention)
- query_runtime_log_details: add missing needs_attention_only parameter
- query_runtime_log_raw: fix step_type -> step_index (int, not str)
- Fix file names: nodes.jsonl -> details.jsonl, steps.jsonl -> tool_logs.jsonl
- Fix error handling examples to match actual code

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-05 18:51:47 +08:00
alidevh 29a3ae471f fix(config): add logging for config parse errors (#4955)
Co-authored-by: alihassan <239741857+alidevh@users.noreply.github.com>
2026-03-05 18:22:34 +08:00
singhhnitin 9c0f56f027 Improve indirect variable expansion for provider API key detection (#5504)
Co-authored-by: Nitin Singh <nitinsingh3323@gmail.com>
2026-03-05 18:12:14 +08:00
Hundao 462e303a6e ci: skip POSIX permission tests on Windows (temporary, see #5842) (#5847)
Windows does not support POSIX file permissions, causing 4 test
failures on Windows CI. Skip these tests until the proper
ReplaceFileW fix lands.
2026-03-05 17:50:05 +08:00
Hundao a84b3c7867 fix: validate agent.json before parsing in AgentRunner.load() (#5846)
Use is_file() instead of exists() to reject directories, and check
for empty content before passing to json parser. Prevents raw
tracebacks on invalid agent.json inputs.

Fixes #5787
2026-03-05 17:23:46 +08:00
Anushka Punekar 606267d053 fix(cli): validate --output path before agent execution in cmd_run (#5838)
* fix(cli): validate --output path before agent execution in cmd_run

* style: fix indentation and formatting

---------

Co-authored-by: hundao <alchemy_wimp@hotmail.com>
2026-03-05 16:51:33 +08:00
Timothy @aden 35791ae478 Merge pull request #5834 from aden-hive/fix/quickstart-tweaks
Release / Create Release (push) Waiting to run
chore(micro-fix): tweak quickstart
2026-03-04 20:06:40 -08:00
Timothy 10f0002080 chore: tweak quickstart 2026-03-04 20:05:16 -08:00
Bryan @ Aden 60bff4107d Merge pull request #5833 from aden-hive/feat/google-scopes
micro-fix: quickstart build failing
2026-03-05 04:03:55 +00:00
bryan be11fa4b29 fix: quickstart build failing 2026-03-04 20:02:23 -08:00
Richard Tang 6ade844722 feat: escalation implementation 2026-03-04 19:59:02 -08:00
Bryan @ Aden da8bc796d3 Merge pull request #5832 from aden-hive/feat/google-scopes
(micro-fix): chore: updating tool tests
2026-03-05 03:53:25 +00:00
bryan 429619379e fix: linter issues 2026-03-04 19:50:25 -08:00
bryan 0fecedbbbf chore: updating tool tests 2026-03-04 19:47:55 -08:00
Timothy @aden a2244ada75 Merge pull request #5764 from aden-hive/feat/google-scopes
Feat/google scopes
2026-03-04 19:43:30 -08:00
bryan 7608ba9290 Merge branch 'main' into feat/google-scopes 2026-03-04 19:40:46 -08:00
Richard Tang b9a3c67fea feat: dynamically load the system prompt in the context 2026-03-04 19:40:20 -08:00
bryan f5f3396d5c chore: update icons of sample agents 2026-03-04 19:38:14 -08:00
bryan ed80ae80f0 feat: twitter news sample agent 2026-03-04 19:37:34 -08:00
Timothy c7a47c71f0 fix: simplify game plan 2026-03-04 19:15:27 -08:00
Richard Tang 219bbe00fc feat: move guardrail to validation 2026-03-04 19:11:37 -08:00
Timothy @aden b14b8f8c52 Merge pull request #5815 from levxn/bug/agent-sessions
Restoring session during server restart | smooth conversation picked from where left off | fix unhandled error in event routes |
2026-03-04 19:10:38 -08:00
bryan df1a83d475 feat/local-business-sample-agent 2026-03-04 19:09:02 -08:00
bryan 5b7727cfd1 fix: permanent top bar 2026-03-04 19:08:20 -08:00
Timothy 93e270dafb fix: change initial plan 2026-03-04 19:01:24 -08:00
Timothy be675dbb17 fix: restructure docs 2026-03-04 18:59:20 -08:00
Timothy 1c24848db3 feat: implement hive github repo and discord as a connected game 2026-03-04 18:52:42 -08:00
Richard Tang ef6af5404f refactor: new builder flow 2026-03-04 18:42:20 -08:00
Richard Tang b7d57f3d49 feat: fix run_command and remove agent search 2026-03-04 18:04:42 -08:00
Richard Tang 58c892babb Merge remote-tracking branch 'origin/main' into feat/queen-responsibility 2026-03-04 18:03:03 -08:00
Timothy @aden 4b5ec796bc Merge pull request #5829 from aden-hive/feat/remove-old-session-status-tools
fix: remove the reference in the coder agent init
2026-03-04 17:42:34 -08:00
Richard Tang 24df4729ca fix: remove the reference in the coder agent init 2026-03-04 17:40:28 -08:00
Richard Tang 9e2004e33b Merge remote-tracking branch 'origin/main' into feat/queen-responsibility 2026-03-04 17:30:09 -08:00
Timothy @aden 1e6538efac Merge pull request #5828 from aden-hive/feat/remove-old-session-status-tools
Remove deprecated get_agent_session_state and get_agent_session_memory tools
2026-03-04 17:29:35 -08:00
Richard Tang f9e53f58af refactor: remove old get_agent_session_state and get_agent_session_memory tools 2026-03-04 17:23:10 -08:00
Timothy 41388efc31 fix: Windows compat — guard os.fchmod and remove deleted LLM_CREDENTIALS import
os.fchmod does not exist on Windows; guard with hasattr check.
Remove LLM_CREDENTIALS reference from test (module deleted in e1db3a4).
2026-03-04 17:22:21 -08:00
Timothy @aden fab5ce6fd0 Merge pull request #5824 from aden-hive/chore/fix-tool-tests
chore(micro-fix): fix test
2026-03-04 17:16:10 -08:00
Richard Tang b8be3056ed feat: list agent tool instruction 2026-03-04 17:03:45 -08:00
Timothy 207d6baee5 chore: fix test 2026-03-04 16:49:39 -08:00
Timothy @aden fec72bb2b6 Merge pull request #5294 from Antiarin/feat/hashline-edit-tool
[Integration]feat: add hashline anchor-based file editing tool
2026-03-04 16:38:13 -08:00
Richard Tang 39029b82d6 refactor: remove the coding agent check in quickstart 2026-03-04 16:27:33 -08:00
Richard Tang 232890b970 docs: remove the reference of the old agent skills 2026-03-04 16:23:30 -08:00
Timothy c4c4c24c59 Merge branch 'main' into feat/hashline-edit-tool 2026-03-04 16:23:07 -08:00
Richard Tang 13a8e28ae2 refactor: remove all old unused skills 2026-03-04 16:18:28 -08:00
bryan 917c7706ea chore: lint fix 2026-03-04 16:14:56 -08:00
bryan 8fadcd5b21 Merge branch 'main' into feat/google-scopes 2026-03-04 16:12:31 -08:00
Timothy @aden 2005ba2dca Merge pull request #5823 from aden-hive/micro-fix/lint
chore(micro-fix): lint
2026-03-04 16:11:51 -08:00
Richard Tang 34a44aa83c chore: update remaining reference for queen mode 2026-03-04 16:11:28 -08:00
Timothy 557d5fd6e5 chore: lint 2026-03-04 16:10:35 -08:00
Richard Tang 8468c45dc2 refactor: rename the queen mode to queen phase for clarity 2026-03-04 16:10:15 -08:00
Richard Tang d2c3649566 refactor: re-organize the agent initialization tools 2026-03-04 16:06:00 -08:00
Antiarin 80b36b4052 fix: CRLF double-conversion in hashline edit and add large file skip reporting
- Replace joined.replace("\n", "\r\n") with re.sub(r"(?<!\r)\n", "\r\n", joined) to prevent \r\n in replace op new_content from becoming \r\r\n (fixed in both hashline_edit.py and file_ops.py)
 - Track and report skipped large files in grep_search instead of silently skipping them
 - Extract HASHLINE_MAX_FILE_BYTES constant to hashline.py as single source of truth, imported by view_file, grep_search, hashline_edit, and file_ops
 - Add tests for CRLF replace op (both copies) and large file skip reporting
2026-03-05 03:12:35 +05:30
levxn 1ec7c5545f fixing lints and formatting 2026-03-05 02:59:57 +05:30
levxn cc6b6760c3 enables resume from where it was left off 2026-03-05 02:34:23 +05:30
Levin 26aed90ab2 Merge branch 'aden-hive:main' into bug/agent-sessions 2026-03-05 02:32:56 +05:30
Timothy 1c58ccb0c1 chore: lint 2026-03-04 12:45:27 -08:00
Timothy 79b80fe817 feat: coder tools to also support hashline editing 2026-03-04 12:41:07 -08:00
Antiarin c0f3841af7 feat: add file size check in grep_search to skip large files and switch case in hashline edit
- Implemented a check to skip files larger than 10MB in the grep_search function to optimize memory usage.
2026-03-04 12:41:07 -08:00
Antiarin 2b7d9bc471 feat: Updating the docs 2026-03-04 12:41:07 -08:00
Antiarin 98dc493a39 feat: Add cross-tool hashing anchor, grep search, and all viewfile 2026-03-04 12:41:07 -08:00
Antiarin cfaa57b28d feat: Add hashing tool 2026-03-04 12:40:25 -08:00
Richard Tang 71226d9625 fix: logger schema mismatch 2026-03-04 10:55:28 -08:00
Richard Tang 9102328d1c feat: make LLM logger by default on 2026-03-04 10:30:17 -08:00
Levin 43092ba1d7 Merge branch 'aden-hive:main' into bug/agent-sessions 2026-03-04 22:33:40 +05:30
bryan 61487db481 chore: linter fixes 2026-03-04 08:44:04 -08:00
bryan c42c8ba505 Merge branch 'main' into feat/google-scopes 2026-03-04 08:25:29 -08:00
levxn e3ea9212dd latest upstream and merge conflicts resolved in workspace.tsx 2026-03-04 12:08:14 +05:30
levxn 8988c1e760 session management and ability to converse from where the chat was left off, fix v1 2026-03-04 11:40:44 +05:30
bryan b1a5f8e730 chore: tool test fixes 2026-03-03 21:01:19 -08:00
bryan 06a9adb051 chore: linter fix 2026-03-03 20:15:42 -08:00
bryan 9ce753055c feat: meeting scheduler agent 2026-03-03 20:01:58 -08:00
bryan 0ce87b5155 refactor: update calendar list events tool 2026-03-03 20:01:42 -08:00
bryan 730370a007 test: update calendar and health check tests 2026-03-03 15:42:22 -08:00
bryan f87909109c refactor: simplify health check system 2026-03-03 15:42:07 -08:00
bryan d6a6d8b5ef refactor: unify Google OAuth to single credential 2026-03-03 15:41:53 -08:00
bryan 57563abfa7 feat: add Google Sheets tool 2026-03-03 15:41:26 -08:00
levxn 7c7b60a5e9 every session loads properly without any issue 2026-03-03 19:46:27 +05:30
levxn 3f0b8bff5b fixes a minor unhandled error in event routes 2026-03-03 18:53:43 +05:30
levxn 91190cf82d restarts with previous session continuity 2026-03-03 17:48:01 +05:30
karthik-kotra 41cd11d5c9 docs(setup): add troubleshooting steps for common WSL setup issues 2026-02-17 07:30:00 +00:00
rhythmtaneja f88483f964 chore: trigger PR revalidation 2026-01-28 09:52:31 +05:30
rhythmtaneja b61ec8c94d Improve EventBus handler error logging by using logger.exception to include traceback 2026-01-28 00:46:23 +05:30
951 changed files with 139638 additions and 81563 deletions
-9
View File
@@ -1,9 +0,0 @@
{
"mcpServers": {
"agent-builder": {
"command": "uv",
"args": ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"],
"disabled": false
}
}
}
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-concepts
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-create
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-credentials
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-patterns
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-test
-5
View File
@@ -1,5 +0,0 @@
---
description: hive-concepts
---
use hive-concepts skill
-5
View File
@@ -1,5 +0,0 @@
---
description: hive-create
---
use hive-create skill
-5
View File
@@ -1,5 +0,0 @@
---
description: hive-credentials
---
use hive-credentials skill
-5
View File
@@ -1,5 +0,0 @@
---
description: hive-patterns
---
use hive-patterns skill
-5
View File
@@ -1,5 +0,0 @@
---
description: hive-test
---
use hive-test skill
-5
View File
@@ -1,5 +0,0 @@
---
description: hive
---
use hive skill
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-concepts
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-create
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-credentials
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-patterns
-1
View File
@@ -1 +0,0 @@
../../.claude/skills/hive-test
+53
View File
@@ -1,4 +1,57 @@
{
"permissions": {
"allow": [
"Bash(grep -n \"_is_context_too_large_error\" core/framework/agent_loop/agent_loop.py core/framework/agent_loop/internals/*.py)",
"Read(//^class/ {cls=$3} /def test_/**)",
"Read(//^ @pytest.mark.asyncio/{getline n; print NR\": \"n} /^ def test_/**)",
"Bash(python3)",
"Bash(grep -nE 'Tool\\\\\\(\\\\s*$|name=\"[a-z_]+\",' core/framework/tools/queen_lifecycle_tools.py)",
"Bash(awk -F'\"' '{print $2}')",
"Bash(grep -n \"create_colony\\\\|colony-spawn\\\\|colony_spawn\" /home/timothy/aden/hive/core/framework/agents/queen/nodes/__init__.py /home/timothy/aden/hive/core/framework/tools/*.py)",
"Bash(git stash:*)",
"Bash(python3 -c \"import sys,json; d=json.loads\\(sys.stdin.read\\(\\)\\); print\\('keys:', list\\(d.keys\\(\\)\\)[:10]\\)\")",
"Bash(python3 -c ':*)",
"Bash(uv run:*)",
"Read(//tmp/**)",
"Bash(grep -n \"useColony\\\\|const { queens, queenProfiles\" /home/timothy/aden/hive/core/frontend/src/pages/queen-dm.tsx)",
"Bash(awk 'NR==385,/\\\\}, \\\\[/' /home/timothy/aden/hive/core/frontend/src/pages/queen-dm.tsx)",
"Bash(xargs -I{} sh -c 'if ! grep -q \"^import base64\\\\|^from base64\" \"{}\"; then echo \"MISSING: {}\"; fi')",
"Bash(find /home/timothy/aden/hive/core/framework -name \"*.py\" -type f -exec grep -l \"FileConversationStore\\\\|class.*ConversationStore\" {} \\\\;)",
"Bash(find /home/timothy/aden/hive/core/framework -name \"*.py\" -exec grep -l \"run_parallel_workers\\\\|create_colony\" {} \\\\;)",
"Bash(awk '/^ async def execute\\\\\\(self, ctx: AgentContext\\\\\\)/,/^ async def [a-z_]+/ {print NR\": \"$0}' /home/timothy/aden/hive/core/framework/agent_loop/agent_loop.py)",
"Bash(grep -r \"max_concurrent_workers\\\\|max_depth\\\\|recursion\\\\|spawn.*bomb\" /home/timothy/aden/hive/core/framework/host/*.py)",
"Bash(wc -l /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
"Bash(file /tmp/gcu_verify/*.png)",
"Bash(ps -eo pid,cmd)",
"Bash(ps -o pid,lstart,cmd -p 746640)",
"Bash(kill 746636)",
"Bash(ps -eo pid,lstart,cmd)",
"Bash(grep -E \"^d|\\\\.py$\")",
"Bash(grep -E \"\\\\.\\(ts|tsx\\)$\")",
"Bash(xargs cat:*)",
"Bash(find /home/timothy/aden/hive -path \"*/.venv\" -prune -o -name \"*.py\" -type f -exec grep -l \"frontend\\\\|UI\\\\|terminal\\\\|interactive\\\\|TUI\" {} \\\\;)",
"Bash(wc -l /home/timothy/.hive/backup/*/SKILL.md)",
"Bash(awk -F'::' '{print $1}')",
"Bash(wait)",
"Bash(pkill -f \"pytest.*test_event_loop_node\")",
"Bash(pkill -f \"pytest.*TestToolConcurrency\")",
"Bash(grep -n \"def.*discover\\\\|/api/agents\\\\|agents_discover\" /home/timothy/aden/hive/core/framework/server/*.py)",
"Bash(bun run:*)",
"Bash(npx eslint:*)",
"Bash(npm run:*)",
"Bash(npm test:*)",
"Bash(grep -n \"PIL\\\\|Image\\\\|to_thread\\\\|run_in_executor\" /home/timothy/aden/hive/tools/src/gcu/browser/*.py /home/timothy/aden/hive/tools/src/gcu/browser/tools/*.py)",
"WebFetch(domain:docs.litellm.ai)",
"Bash(cat /home/timothy/aden/hive/.venv/lib/python3.11/site-packages/litellm-*.dist-info/METADATA)",
"Bash(find \"/home/timothy/.hive/agents/queens/queen_brand_design/sessions/session_20260415_100751_d49f4c28/\" -type f -name \"*.json*\" -exec grep -l \"协日\" {} \\\\;)",
"Bash(grep -v ':0$')"
],
"additionalDirectories": [
"/home/timothy/.hive/skills/writing-hive-skills",
"/tmp",
"/home/timothy/.hive/skills"
]
},
"hooks": {
"PostToolUse": [
{
+2 -20
View File
@@ -1,34 +1,16 @@
{
"permissions": {
"allow": [
"mcp__agent-builder__create_session",
"mcp__agent-builder__set_goal",
"mcp__agent-builder__add_node",
"mcp__agent-builder__add_edge",
"mcp__agent-builder__configure_loop",
"mcp__agent-builder__add_mcp_server",
"mcp__agent-builder__validate_graph",
"mcp__agent-builder__export_graph",
"mcp__agent-builder__load_session_by_id",
"Bash(git status:*)",
"Bash(gh run view:*)",
"Bash(uv run:*)",
"Bash(env:*)",
"mcp__agent-builder__test_node",
"mcp__agent-builder__list_mcp_tools",
"Bash(python -m py_compile:*)",
"Bash(python -m pytest:*)",
"Bash(source:*)",
"mcp__agent-builder__update_node",
"mcp__agent-builder__check_missing_credentials",
"mcp__agent-builder__list_stored_credentials",
"Bash(find:*)",
"mcp__agent-builder__run_tests",
"Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)",
"mcp__agent-builder__list_agent_sessions",
"mcp__agent-builder__generate_constraint_tests",
"mcp__agent-builder__generate_success_tests"
"Bash(PYTHONPATH=core:exports:tools/src uv run pytest:*)"
]
},
"enabledMcpjsonServers": ["agent-builder", "tools"]
"enabledMcpjsonServers": ["tools"]
}
+241
View File
@@ -0,0 +1,241 @@
---
name: browser-edge-cases
description: SOP for debugging browser automation failures on complex websites. Use when browser tools fail on specific sites like LinkedIn, Twitter/X, SPAs, or sites with Shadow DOM.
license: MIT
---
# Browser Tool Edge Cases
Standard Operating Procedure for debugging and fixing browser automation failures on complex websites.
## When to Use This Skill
- `browser_scroll` succeeds but page doesn't move
- `browser_click` succeeds but no action triggered
- `browser_type` text disappears or doesn't work
- `browser_snapshot` hangs or returns stale content
- `browser_navigate` loads wrong content
## SOP: Debugging Browser Tool Failures
### Phase 1: Reproduce & Isolate
```
1. Create minimal test case demonstrating failure
2. Test against simple site (example.com) to verify tool works
3. Test against problematic site to confirm issue
```
**Quick isolation test:**
```python
# Test 1: Does the tool work at all?
await browser_navigate(tab_id, "https://example.com")
result = await browser_scroll(tab_id, "down", 100)
# Should work on simple sites
# Test 2: Does it fail on the problematic site?
await browser_navigate(tab_id, "https://linkedin.com/feed")
result = await browser_scroll(tab_id, "down", 100)
# If this fails but example.com works → site-specific edge case
```
### Phase 2: Analyze Root Cause
**Step 2a: Check console for errors**
```python
console = await browser_console(tab_id)
# Look for: CSP violations, React errors, JavaScript exceptions
```
**Step 2b: Inspect DOM structure**
```python
html = await browser_html(tab_id)
snapshot = await browser_snapshot(tab_id)
# Look for:
# - Nested scrollable divs (overflow: scroll/auto)
# - Shadow DOM roots
# - iframes
# - Custom widgets
```
**Step 2c: Identify the pattern**
| Symptom | Likely Cause | Check |
|---------|--------------|-------|
| Scroll doesn't move | Nested scroll container | Look for `overflow: scroll` divs |
| Click no effect | Element covered | Check `getBoundingClientRect` vs viewport |
| Type clears | Autocomplete/React | Check for event listeners on input |
| Snapshot hangs | Huge DOM | Check node count in snapshot |
| Snapshot stale | SPA hydration | Wait after navigation |
### Phase 3: Implement Multi-Layer Fix
**Pattern: Always have fallbacks**
```python
async def robust_operation(tab_id):
# Method 1: Primary approach
try:
result = await primary_method(tab_id)
if verify_success(result):
return result
except Exception:
pass
# Method 2: CDP fallback
try:
result = await cdp_fallback(tab_id)
if verify_success(result):
return result
except Exception:
pass
# Method 3: JavaScript fallback
return await javascript_fallback(tab_id)
```
**Pattern: Always add timeouts**
```python
# Bad - can hang forever
result = await browser_snapshot(tab_id)
# Good - fails fast with useful error
try:
result = await browser_snapshot(tab_id, timeout_s=10.0)
except asyncio.TimeoutError:
# Handle timeout gracefully
result = await fallback_snapshot(tab_id)
```
### Phase 4: Verify Fix
```
1. Run against problematic site → should work
2. Run against simple site → should still work (regression check)
3. Document in registry.md
```
## Pattern Library
### P1: Nested Scrollable Containers
**Sites:** LinkedIn, Twitter/X, any SPA with scrollable feeds
**Detection:**
```javascript
// Find largest scrollable container
const candidates = [];
document.querySelectorAll('*').forEach(el => {
const style = getComputedStyle(el);
if (style.overflow.includes('scroll') || style.overflow.includes('auto')) {
const rect = el.getBoundingClientRect();
if (rect.width > 100 && rect.height > 100) {
candidates.push({el, area: rect.width * rect.height});
}
}
});
candidates.sort((a, b) => b.area - a.area);
return candidates[0]?.el;
```
**Fix:** Dispatch scroll events at container's center, not viewport center.
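A minimal sketch of that fix, assuming the container found by the detection snippet above and a page-evaluation helper (`bridge.evaluate`, borrowed from the test scripts under `scripts/`) rather than any particular shipped API:
```python
# Sketch only: scroll the detected feed container instead of the window.
# bridge.evaluate(tab_id, js) is an assumption taken from scripts/test_*.py;
# substitute whatever JavaScript-eval entry point your browser tool exposes.
SCROLL_FEED_JS = """
(function() {
    // Re-run the detection above: the largest scrollable element wins.
    let best = null, bestArea = 0;
    document.querySelectorAll('*').forEach(el => {
        const style = getComputedStyle(el);
        if (style.overflow.includes('scroll') || style.overflow.includes('auto')) {
            const rect = el.getBoundingClientRect();
            const area = rect.width * rect.height;
            if (rect.width > 100 && rect.height > 100 && area > bestArea) {
                best = el;
                bestArea = area;
            }
        }
    });
    const target = best || document.scrollingElement;
    const before = target.scrollTop;
    target.scrollBy(0, 500);  // scroll the container, not the window
    return { moved: target.scrollTop !== before, tag: target.tagName };
})()
"""

async def scroll_feed_container(bridge, tab_id):
    result = await bridge.evaluate(tab_id, SCROLL_FEED_JS)
    return result.get("result", {})
```
If `moved` comes back `false`, fall back to dispatching a mouse wheel event at the container's center (see the Quick Reference table below).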
### P2: Element Covered by Overlay
**Sites:** Modals, tooltips, SPAs with loading overlays
**Detection:**
```javascript
const rect = element.getBoundingClientRect();
const centerX = rect.left + rect.width / 2;
const centerY = rect.top + rect.height / 2;
const topElement = document.elementFromPoint(centerX, centerY);
return topElement === element || element.contains(topElement);
```
**Fix:** Wait for overlay to disappear, or use JavaScript click.
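A hedged sketch of that fix: poll the detection check above, then fall back to a JavaScript click. The `bridge.evaluate`/`bridge.click` names are assumptions carried over from the test scripts, not a fixed API.
```python
# Hedged sketch: wait until nothing covers the target, then fall back to a
# JavaScript click that ignores whatever overlay is painted on top.
import asyncio
import json

async def click_past_overlay(bridge, tab_id, selector, attempts=5):
    sel = json.dumps(selector)  # safely embed the selector into JS
    covered_js = f"""
    (function() {{
        const el = document.querySelector({sel});
        if (!el) return {{ found: false }};
        const rect = el.getBoundingClientRect();
        const top = document.elementFromPoint(rect.left + rect.width / 2,
                                              rect.top + rect.height / 2);
        return {{ found: true, covered: top !== el && !el.contains(top) }};
    }})()
    """
    for _ in range(attempts):
        state = (await bridge.evaluate(tab_id, covered_js)).get("result", {})
        if state.get("found") and not state.get("covered"):
            return await bridge.click(tab_id, selector)  # element is reachable
        await asyncio.sleep(0.5)                         # overlay may still fade
    # Last resort: a JavaScript click bypasses the overlay entirely.
    return await bridge.evaluate(tab_id, f"document.querySelector({sel}).click()")
```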
### P3: React Synthetic Events
**Sites:** React SPAs, modern web apps
**Detection:** If CDP click doesn't trigger handler but manual click works.
**Fix:** Use JavaScript click as primary:
```javascript
element.click();
```
### P4: Huge DOM / Accessibility Tree
**Sites:** LinkedIn, Facebook, Twitter (feeds with 1000s of nodes)
**Detection:**
```javascript
document.querySelectorAll('*').length > 5000
```
**Fix** (a minimal sketch follows this list):
1. Add timeout to snapshot operation
2. Truncate tree at 2000 nodes
3. Fall back to DOM-based snapshot if accessibility tree too large
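The sketch below combines fixes 1 and 3; `timeout_s` matches the Phase 3 example, while `node_count` and `snapshot_from_html` are placeholder names for whatever your snapshot result and DOM-based fallback actually provide:
```python
# Sketch under stated assumptions: bound the accessibility-tree snapshot with
# a hard timeout and fall back to a truncated DOM-based snapshot on huge pages.
import asyncio

async def snapshot_with_fallback(tab_id, timeout_s=10.0, max_nodes=2000):
    snap = None
    try:
        async with asyncio.timeout(timeout_s):   # Python 3.11+
            snap = await browser_snapshot(tab_id)
    except TimeoutError:
        pass                                     # tree too large or too slow
    if snap is None or snap.get("node_count", 0) > max_nodes:
        html = await browser_html(tab_id)        # cheap raw-DOM dump (Step 2b)
        snap = snapshot_from_html(html, max_nodes=max_nodes)  # hypothetical helper
    return snap
```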
### P5: SPA Hydration Delay
**Sites:** React, Vue, Angular SPAs after navigation
**Detection:**
```javascript
// Check if React app has hydrated
document.querySelector('[data-reactroot]') ||
document.querySelector('[data-reactid]')
```
**Fix:** Wait for specific selector after navigation:
```python
await browser_navigate(tab_id, url, wait_until="load")
await browser_wait(tab_id, selector='[data-testid="content"]', timeout_ms=5000)
```
### P6: Shadow DOM
**Sites:** Components using Shadow DOM, Lit elements
**Detection:**
```javascript
[...document.querySelectorAll('*')].some(el => el.shadowRoot)
```
**Fix:** Pierce shadow root:
```javascript
function queryShadow(selector) {
const parts = selector.split('>>>');
let node = document;
for (const part of parts) {
if (node.shadowRoot) {
node = node.shadowRoot.querySelector(part.trim());
} else {
node = node.querySelector(part.trim());
}
}
return node;
}
```
## Quick Reference
| Issue | Primary Fix | Fallback |
|-------|-------------|----------|
| Scroll not working | Find scrollable container | Mouse wheel at container center |
| Click no effect | JavaScript click() | CDP mouse events |
| Type clears | Add delay_ms | Use execCommand |
| Snapshot hangs | Add timeout_s | DOM snapshot fallback |
| Stale content | Wait for selector | Increase wait_until timeout |
| Shadow DOM | Pierce selector | JavaScript traversal |
## References
- [registry.md](registry.md) - Full list of known edge cases
- [scripts/test_case.py](scripts/test_case.py) - Template for testing new cases
- [BROWSER_USE_PATTERNS.md](../../tools/BROWSER_USE_PATTERNS.md) - Implementation patterns from browser-use
@@ -0,0 +1,261 @@
# Browser Edge Case Registry
Curated list of known browser automation edge cases with symptoms, causes, and fixes.
---
## Scroll Issues
### #1: LinkedIn Nested Scroll Container
| Attribute | Value |
|-----------|-------|
| **Site** | LinkedIn (linkedin.com/feed) |
| **Symptom** | `browser_scroll()` returns `{ok: true}` but page doesn't move |
| **Root Cause** | Content is in a nested scrollable div (`overflow: scroll`), not the main window |
| **Detection** | `document.querySelectorAll('*')` with `overflow: scroll/auto` has large candidates |
| **Fix** | JavaScript finds largest scrollable container, uses `container.scrollBy()` |
| **Code** | `bridge.py:808-891` - smart scroll with container detection |
| **Verified** | 2026-04-03 ✓ |
### #2: Twitter/X Lazy Loading
| Attribute | Value |
|-----------|-------|
| **Site** | Twitter/X (x.com) |
| **Symptom** | Infinite scroll doesn't load new content |
| **Root Cause** | Lazy loading requires content to be visible before loading more |
| **Detection** | Scroll position at bottom but no new `[data-testid="tweet"]` elements |
| **Fix** | Add `wait_for_selector` between scroll calls with 1s delay |
| **Code** | Test file: `tests/test_x_page_load_repro.py` |
| **Verified** | - |
### #3: Modal/Dialog Scroll Container
| Attribute | Value |
|-----------|-------|
| **Site** | Any site with modal dialogs |
| **Symptom** | Scroll scrolls background page, not modal content |
| **Root Cause** | Modal has its own scroll container with `overflow: scroll` |
| **Detection** | Visible element with `position: fixed` and scrollable content |
| **Fix** | Find visible modal container (highest z-index scrollable), scroll that |
| **Code** | - |
| **Verified** | - |
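The Code cell above is still empty, so the following is only a sketch of the described approach; `bridge.evaluate` is an assumption borrowed from the test scripts in `scripts/`.
```python
# Sketch of the proposed fix, not shipped code: find the visible modal's own
# scroll container (scrollable, visible, highest z-index) and scroll it
# directly instead of the page behind it.
MODAL_SCROLL_JS = """
(function() {
    let best = null, bestZ = -1;
    document.querySelectorAll('*').forEach(el => {
        const style = getComputedStyle(el);
        const scrollable = /(scroll|auto)/.test(style.overflowY) &&
                           el.scrollHeight > el.clientHeight;
        const visible = el.getBoundingClientRect().height > 0 &&
                        style.visibility !== 'hidden';
        const z = parseInt(style.zIndex, 10) || 0;
        if (scrollable && visible && z >= bestZ) {
            best = el;
            bestZ = z;
        }
    });
    if (!best) return { scrolled: false };
    best.scrollBy(0, 100);  // scroll the modal body, not the background page
    return { scrolled: true, tag: best.tagName, zIndex: bestZ };
})()
"""

async def scroll_modal(bridge, tab_id):
    result = await bridge.evaluate(tab_id, MODAL_SCROLL_JS)
    return result.get("result", {})
```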
---
## Click Issues
### #4: Element Covered by Overlay
| Attribute | Value |
|-----------|-------|
| **Site** | SPAs, sites with loading overlays |
| **Symptom** | Click succeeds but no action triggered |
| **Root Cause** | Element is covered by transparent overlay, tooltip, or iframe |
| **Detection** | `document.elementFromPoint(x, y) !== target` |
| **Fix** | Wait for overlay to disappear, or use JavaScript `element.click()` |
| **Code** | `bridge.py:394-591` - JavaScript click as primary |
| **Verified** | - |
### #5: React Synthetic Events
| Attribute | Value |
|-----------|-------|
| **Site** | React applications |
| **Symptom** | CDP click doesn't trigger React handler |
| **Root Cause** | React uses synthetic events that don't respond to CDP events |
| **Detection** | Site uses React (check for `__reactFiber$` or `data-reactroot`) |
| **Fix** | Use JavaScript `element.click()` as primary method |
| **Code** | `bridge.py:394-591` - JavaScript-first click |
| **Verified** | - |
### #6: Shadow DOM Elements
| Attribute | Value |
|-----------|-------|
| **Site** | Components using Shadow DOM, Lit elements |
| **Symptom** | `querySelector` can't find element |
| **Root Cause** | Element is inside a shadow root, not main DOM tree |
| **Detection** | `element.shadowRoot !== null` on parent elements |
| **Fix** | Use piercing selector (`host >>> target`) or traverse shadow roots |
| **Code** | See SKILL.md P6 pattern |
| **Verified** | 2026-04-03 ✓ |
---
## Input Issues
### #7: ContentEditable / Rich Text Editors
| Attribute | Value |
|-----------|-------|
| **Site** | Rich text editors (Notion, Slack web, etc.) |
| **Symptom** | `browser_type()` doesn't insert text |
| **Root Cause** | Element is `contenteditable`, not an `<input>` or `<textarea>` |
| **Detection** | `element.contentEditable === 'true'` |
| **Fix** | Focus via JavaScript, use `execCommand('insertText')` or `Input.dispatchKeyEvent` |
| **Code** | `bridge.py:616-694` - contentEditable handling |
| **Verified** | 2026-04-03 ✓ |
### #8: Autocomplete Field Clearing
| Attribute | Value |
|-----------|-------|
| **Site** | Search fields with autocomplete, address forms |
| **Symptom** | Typed text gets cleared immediately |
| **Root Cause** | Field expects realistic keystroke timing for autocomplete |
| **Detection** | Field has autocomplete listeners or dropdown appears |
| **Fix** | Add `delay_ms=50` between keystrokes |
| **Code** | `bridge.py:type()` - delay_ms parameter |
| **Verified** | 2026-04-03 ✓ |
### #9: Custom Date Pickers
| Attribute | Value |
|-----------|-------|
| **Site** | Forms with custom date widgets |
| **Symptom** | Can't type date into date field |
| **Root Cause** | Custom widget intercepts and blocks keyboard input |
| **Detection** | Typing doesn't change field value |
| **Fix** | Click calendar widget icon, select date from dropdown |
| **Code** | - |
| **Verified** | - |
---
## Snapshot Issues
### #10: LinkedIn Huge DOM Tree
| Attribute | Value |
|-----------|-------|
| **Site** | LinkedIn, Facebook, Twitter feeds |
| **Symptom** | `browser_snapshot()` hangs forever |
| **Root Cause** | 10k+ DOM nodes, accessibility tree has 50k+ nodes |
| **Detection** | `document.querySelectorAll('*').length > 5000` |
| **Fix** | Add `timeout_s` param with `asyncio.timeout()`, proper error handling |
| **Code** | `bridge.py:1041-1028` - snapshot with timeout protection |
| **Verified** | 2026-04-03 ✓ (0.08s on LinkedIn) |
### #11: SPA Hydration Delay
| Attribute | Value |
|-----------|-------|
| **Site** | React/Vue/Angular SPAs |
| **Symptom** | Snapshot shows old content after navigation |
| **Root Cause** | Client-side hydration hasn't completed when snapshot runs |
| **Detection** | `document.readyState === 'complete'` but content missing |
| **Fix** | Wait for specific selector after navigation |
| **Code** | Test file: `tests/test_x_page_load_repro.py` |
| **Verified** | - |
### #12: iframe Content Missing
| Attribute | Value |
|-----------|-------|
| **Site** | Sites with embedded content |
| **Symptom** | Snapshot missing iframe content |
| **Root Cause** | Accessibility tree doesn't include iframe content |
| **Detection** | `document.querySelectorAll('iframe')` has results |
| **Fix** | Use `DOM.getFrameOwner` + separate snapshot for each iframe |
| **Code** | - |
| **Verified** | - |
---
## Navigation Issues
### #13: SPA Navigation Events
| Attribute | Value |
|-----------|-------|
| **Site** | React Router, Vue Router SPAs |
| **Symptom** | `wait_until="load"` fires before content ready |
| **Root Cause** | SPA uses client-side routing, no full page load |
| **Detection** | URL changes but `load` event already fired |
| **Fix** | Use `wait_until="networkidle"` or `wait_for_selector` |
| **Code** | `bridge.py:navigate()` - wait_until options |
| **Verified** | - |
### #14: Cross-Origin Redirects
| Attribute | Value |
|-----------|-------|
| **Site** | OAuth flows, SSO logins |
| **Symptom** | Navigation fails during redirect |
| **Root Cause** | Cross-origin security prevents CDP tracking |
| **Detection** | URL changes to different domain |
| **Fix** | Use `wait_for_url` with pattern matching instead of exact URL |
| **Code** | - |
| **Verified** | - |
---
## Screenshot Issues
### #15: Selector Screenshot Not Implemented
| Attribute | Value |
|-----------|-------|
| **Site** | Any site |
| **Symptom** | `browser_screenshot(selector="h1")` takes full viewport instead of element |
| **Root Cause** | `selector` param existed in signature but was silently ignored in both `bridge.py` and `inspection.py` |
| **Detection** | Screenshot with selector same byte size as screenshot without selector |
| **Fix** | Use CDP `Runtime.evaluate` to call `getBoundingClientRect()` on the element, pass result as `clip` to `Page.captureScreenshot` |
| **Code** | `bridge.py:1315-1344` - selector clip logic; `inspection.py:94-96` - pass selector to bridge |
| **Verified** | 2026-04-03 ✓ (JS rect query returns correct viewport coords; requires server restart) |
### #16: Stale Browser Context (Group ID Mismatch)
| Attribute | Value |
|-----------|-------|
| **Site** | Any |
| **Symptom** | `browser_open()` returns `"No group with id: XXXXXXX"` even though `browser_status` shows `running: true` |
| **Root Cause** | In-memory `_contexts` dict has a stale `groupId` from a Chrome tab group that was closed outside the tool (e.g. user closed the tab group) |
| **Detection** | `browser_status` returns `running: true` but `browser_open` fails with "No group with id" |
| **Fix** | Call `browser_stop()` to clear stale context from `_contexts`, then `browser_start()` again |
| **Code** | `tools/lifecycle.py:144-160` - `already_running` check uses cached dict without validating against Chrome |
| **Verified** | 2026-04-03 ✓ |
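Since the fix is a stop/start recovery sequence, here is a hedged sketch using only the tool names this entry mentions (the error-string check is an assumption about the failure shape):
```python
# Minimal recovery sketch: if browser_open reports a stale group id, clear the
# cached context with browser_stop and start fresh before retrying.
async def open_with_stale_context_recovery(url):
    result = await browser_open(url)
    if "No group with id" in str(result):  # stale groupId left in _contexts
        await browser_stop()               # drop the cached context
        await browser_start()              # spin up a fresh one
        result = await browser_open(url)   # retry with the new context
    return result
```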
---
## How to Add New Edge Cases
1. **Reproduce** the issue with minimal test case
2. **Document** using the template below
3. **Implement** fix with multi-layer fallback
4. **Verify** against both problematic and simple sites
5. **Submit** by appending to this file
### Template
```markdown
### #N: [Short Title]
| Attribute | Value |
|-----------|-------|
| **Site** | [URL or site type] |
| **Symptom** | [What the user observes] |
| **Root Cause** | [Technical explanation] |
| **Detection** | [JavaScript to detect this case] |
| **Fix** | [Solution approach] |
| **Code** | [File:line reference if implemented] |
| **Verified** | [Date or "pending"] |
```
---
## Statistics
| Category | Count |
|----------|-------|
| Scroll Issues | 3 |
| Click Issues | 3 |
| Input Issues | 3 |
| Snapshot Issues | 3 |
| Navigation Issues | 2 |
| Screenshot Issues | 2 |
| **Total** | **16** |
Last updated: 2026-04-03
@@ -0,0 +1,113 @@
#!/usr/bin/env python
"""
Test #2: Twitter/X Lazy Loading Scroll
Symptom: Infinite scroll doesn't load new content
Root Cause: Lazy loading requires content to be visible before loading more
Fix: Add wait_for_selector between scroll calls
"""
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
BRIDGE_PORT = 9229
CONTEXT_NAME = "twitter-scroll-test"
async def test_twitter_lazy_scroll():
"""Test that repeated scrolls with waits load new content."""
print("=" * 70)
print("TEST #2: Twitter/X Lazy Loading Scroll")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
print(f"Waiting for extension... ({i + 1}/10)")
else:
print("✗ Extension not connected")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Navigate to Twitter/X
print("\n--- Navigating to X.com ---")
await bridge.navigate(tab_id, "https://x.com", wait_until="networkidle", timeout_ms=30000)
print("✓ Page loaded")
# Wait for tweets to appear
print("\n--- Waiting for tweets ---")
await bridge.wait_for_selector(tab_id, '[data-testid="tweet"]', timeout_ms=10000)
# Count initial tweets
initial_count = await bridge.evaluate(
tab_id,
"(function() { return document.querySelectorAll("
"'[data-testid=\"tweet\"]').length; })()",
)
print(f"Initial tweet count: {initial_count.get('result', 0)}")
# Take screenshot of initial state
screenshot = await bridge.screenshot(tab_id)
print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
# Scroll multiple times with waits
print("\n--- Scrolling with waits ---")
for i in range(3):
result = await bridge.scroll(tab_id, "down", 500)
print(f" Scroll {i + 1}: {result.get('method', 'unknown')} method")
# Wait for new content to load
await asyncio.sleep(2)
# Count tweets after scroll
count_result = await bridge.evaluate(
tab_id,
"(function() { return document.querySelectorAll("
"'[data-testid=\"tweet\"]').length; })()",
)
count = count_result.get("result", 0)
print(f" Tweet count after scroll: {count}")
# Final count
final_count = await bridge.evaluate(
tab_id,
"(function() { return document.querySelectorAll("
"'[data-testid=\"tweet\"]').length; })()",
)
final = final_count.get("result", 0)
initial = initial_count.get("result", 0)
print("\n--- Results ---")
print(f"Initial tweets: {initial}")
print(f"Final tweets: {final}")
if final > initial:
print(f"✓ PASS: Loaded {final - initial} new tweets")
else:
print("✗ FAIL: No new tweets loaded (may need login)")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
if __name__ == "__main__":
asyncio.run(test_twitter_lazy_scroll())
@@ -0,0 +1,96 @@
#!/usr/bin/env python
"""
Test #3: Modal/Dialog Scroll Container
Symptom: Scroll scrolls background page, not modal content
Root Cause: Modal has its own scroll container with overflow: scroll
Fix: Find visible modal container (highest z-index scrollable), scroll that
"""
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
BRIDGE_PORT = 9229
CONTEXT_NAME = "modal-scroll-test"
# Test site with modal - using a demo site
MODAL_DEMO_URL = "https://www.w3schools.com/howto/howto_css_modals.asp"
async def test_modal_scroll():
"""Test that scroll targets modal content, not background."""
print("=" * 70)
print("TEST #3: Modal/Dialog Scroll Container")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
else:
print("✗ Extension not connected")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Navigate to modal demo
print("\n--- Navigating to modal demo ---")
await bridge.navigate(tab_id, MODAL_DEMO_URL, wait_until="load")
print("✓ Page loaded")
# Take screenshot before
screenshot_before = await bridge.screenshot(tab_id)
print(f"Screenshot before: {len(screenshot_before.get('data', ''))} bytes")
# Click button to open modal
print("\n--- Opening modal ---")
# Find and click the "Open Modal" button
result = await bridge.click(tab_id, ".ws-btn", timeout_ms=5000)
print(f"Click result: {result}")
await asyncio.sleep(1)
# Take screenshot with modal open
screenshot_modal = await bridge.screenshot(tab_id)
print(f"Screenshot modal open: {len(screenshot_modal.get('data', ''))} bytes")
# Try to scroll within modal
print("\n--- Scrolling modal content ---")
result = await bridge.scroll(tab_id, "down", 100)
print(f"Scroll result: {result}")
await asyncio.sleep(0.5)
# Take screenshot after scroll
screenshot_after = await bridge.screenshot(tab_id)
print(f"Screenshot after scroll: {len(screenshot_after.get('data', ''))} bytes")
# Check if modal content scrolled (not background)
# This is a visual check - we can verify by comparing screenshots
print("\n--- Results ---")
print(f"Modal scroll test completed. Method used: {result.get('method', 'unknown')}")
print("Visual verification needed: Check if modal content scrolled vs background")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
if __name__ == "__main__":
asyncio.run(test_modal_scroll())
@@ -0,0 +1,123 @@
#!/usr/bin/env python
"""
Test #4: Element Covered by Overlay
Symptom: Click succeeds but no action triggered
Root Cause: Element is covered by transparent overlay, tooltip, or iframe
Detection: document.elementFromPoint(x, y) !== target
Fix: Wait for overlay to disappear, or use JavaScript element.click()
"""
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
CONTEXT_NAME = "overlay-click-test"
async def test_overlay_click():
"""Test clicking elements that are covered by overlays."""
print("=" * 70)
print("TEST #4: Element Covered by Overlay")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
else:
print("✗ Extension not connected")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Create a test page with overlay
print("\n--- Creating test page with overlay ---")
test_html = """
<!DOCTYPE html>
<html>
<head><title>Overlay Test</title></head>
<body>
<button id="target-btn" onclick="alert('Clicked!')">Click Me</button>
<div id="overlay" style="position:fixed;top:0;left:0;
width:100%;height:100%;
background:rgba(0,0,0,0.3);z-index:1000;"></div>
<script>
window.clickCount = 0;
document.getElementById('target-btn').addEventListener('click', () => {
window.clickCount++;
});
</script>
</body>
</html>
"""
# Navigate to data URL
import base64
data_url = f"data:text/html;base64,{base64.b64encode(test_html.encode()).decode()}"
await bridge.navigate(tab_id, data_url, wait_until="load")
# Screenshot before
screenshot = await bridge.screenshot(tab_id)
print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
# Try to click the covered button
print("\n--- Attempting to click covered button ---")
# First, check if element is covered
coverage_check = await bridge.evaluate(
tab_id,
"""
(function() {
const btn = document.getElementById('target-btn');
const rect = btn.getBoundingClientRect();
const centerX = rect.left + rect.width / 2;
const centerY = rect.top + rect.height / 2;
const topElement = document.elementFromPoint(centerX, centerY);
return {
isCovered: topElement !== btn && !btn.contains(topElement),
topElement: topElement?.tagName,
targetElement: btn.tagName
};
})();
""",
)
print(f"Coverage check: {coverage_check.get('result', {})}")
# Try CDP click (may fail due to overlay)
click_result = await bridge.click(tab_id, "#target-btn", timeout_ms=5000)
print(f"Click result: {click_result}")
# Check if click registered
count_result = await bridge.evaluate(tab_id, "(function() { return window.clickCount; })()")
count = count_result.get("result", 0)
print(f"Click count after CDP click: {count}")
if count > 0:
print("✓ PASS: JavaScript click penetrated overlay")
else:
print("✗ FAIL: Click did not reach button (overlay blocked it)")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
if __name__ == "__main__":
asyncio.run(test_overlay_click())
@@ -0,0 +1,152 @@
#!/usr/bin/env python
"""
Test #6: Shadow DOM Elements
Symptom: querySelector can't find element
Root Cause: Element is inside a shadow root, not main DOM tree
Detection: element.shadowRoot !== null on parent elements
Fix: Use piercing selector (host >>> target) or traverse shadow roots
"""
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
CONTEXT_NAME = "shadow-dom-test"
async def test_shadow_dom():
"""Test clicking elements inside Shadow DOM."""
print("=" * 70)
print("TEST #6: Shadow DOM Elements")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
else:
print("✗ Extension not connected")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Create test page with Shadow DOM
print("\n--- Creating test page with Shadow DOM ---")
test_html = """
<!DOCTYPE html>
<html>
<head><title>Shadow DOM Test</title></head>
<body>
<div id="shadow-host"></div>
<script>
const host = document.getElementById('shadow-host');
const shadow = host.attachShadow({ mode: 'open' });
shadow.innerHTML = `
<style>
button { padding: 10px 20px; font-size: 16px; }
</style>
<button id="shadow-btn">Shadow Button</button>
`;
shadow.getElementById('shadow-btn').addEventListener('click', () => {
window.shadowClickCount = (window.shadowClickCount || 0) + 1;
console.log('Shadow button clicked:', window.shadowClickCount);
});
</script>
</body>
</html>
"""
# Write to file and use file:// URL (data: URLs don't work well with extension)
test_file = Path("/tmp/shadow_dom_test.html")
test_file.write_text(test_html.strip())
file_url = f"file://{test_file}"
await bridge.navigate(tab_id, file_url, wait_until="load")
print("✓ Page loaded")
# Screenshot
screenshot = await bridge.screenshot(tab_id)
print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
# Detect Shadow DOM
print("\n--- Detecting Shadow DOM ---")
detection = await bridge.evaluate(
tab_id,
"""
(function() {
const hosts = [];
document.querySelectorAll('*').forEach(el => {
if (el.shadowRoot) {
hosts.push({
tag: el.tagName,
id: el.id,
hasButton: el.shadowRoot.querySelector('button') !== null
});
}
});
return { count: hosts.length, hosts };
})();
""",
)
print(f"Shadow DOM detection: {detection.get('result', {})}")
# Try to click shadow button using regular selector (should fail)
print("\n--- Attempting click with regular selector ---")
try:
result = await bridge.click(tab_id, "#shadow-btn", timeout_ms=3000)
print(f"Result: {result}")
except Exception as e:
print(f"Expected failure: {e}")
# Try to click using JavaScript that pierces shadow DOM
print("\n--- Clicking via JavaScript shadow piercing ---")
click_result = await bridge.evaluate(
tab_id,
"""
(function() {
const host = document.getElementById('shadow-host');
const btn = host.shadowRoot.getElementById('shadow-btn');
if (btn) {
btn.click();
return { success: true, clicked: 'shadow-btn' };
}
return { success: false, error: 'Button not found' };
})();
""",
)
print(f"JS click result: {click_result.get('result', {})}")
# Verify click was registered
count_result = await bridge.evaluate(
tab_id, "(function() { return window.shadowClickCount || 0; })()"
)
count = count_result.get("result") or 0
print(f"Shadow click count: {count}")
if count and count > 0:
print("✓ PASS: Shadow DOM element clicked successfully")
else:
print("✗ FAIL: Could not click Shadow DOM element")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
if __name__ == "__main__":
asyncio.run(test_shadow_dom())
@@ -0,0 +1,180 @@
#!/usr/bin/env python
"""
Test #7: ContentEditable / Rich Text Editors
Symptom: browser_type() doesn't insert text
Root Cause: Element is contenteditable, not an <input> or <textarea>
Detection: element.contentEditable === 'true'
Fix: Focus via JavaScript, use execCommand('insertText') or Input.dispatchKeyEvent
"""
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
CONTEXT_NAME = "contenteditable-test"
async def test_contenteditable():
"""Test typing into contenteditable elements."""
print("=" * 70)
print("TEST #7: ContentEditable / Rich Text Editors")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
else:
print("✗ Extension not connected")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Create test page with contenteditable
test_html = """
<!DOCTYPE html>
<html>
<head><title>ContentEditable Test</title></head>
<body>
<h2>ContentEditable Test</h2>
<h3>1. Simple contenteditable div</h3>
<div id="editor1" contenteditable="true"
style="border:1px solid #ccc;padding:10px;
min-height:50px;">Start text</div>
<h3>2. Rich text editor (like Notion)</h3>
<div id="editor2" contenteditable="true"
style="border:1px solid #ccc;padding:10px;
min-height:50px;">
<p>Type here...</p>
</div>
<h3>3. Regular input (for comparison)</h3>
<input id="input1" type="text" placeholder="Regular input" />
<script>
// Track content changes
window.editor1Content = '';
window.editor2Content = '';
document.getElementById('editor1').addEventListener('input', (e) => {
window.editor1Content = e.target.innerText;
});
document.getElementById('editor2').addEventListener('input', (e) => {
window.editor2Content = e.target.innerText;
});
</script>
</body>
</html>
"""
# Write to file and use file:// URL (data: URLs don't work well with extension)
test_file = Path("/tmp/contenteditable_test.html")
test_file.write_text(test_html.strip())
file_url = f"file://{test_file}"
await bridge.navigate(tab_id, file_url, wait_until="load")
print("✓ Page loaded")
# Screenshot with timeout protection
try:
screenshot = await asyncio.wait_for(bridge.screenshot(tab_id), timeout=10.0)
print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
except asyncio.TimeoutError:
print("Screenshot timed out (skipping)")
# Detect contenteditable
print("\n--- Detecting contenteditable elements ---")
detection = await bridge.evaluate(
tab_id,
"""
(function() {
const editables = document.querySelectorAll('[contenteditable="true"]');
return {
count: editables.length,
ids: Array.from(editables).map(el => el.id)
};
})();
""",
)
print(f"Contenteditable detection: {detection.get('result', {})}")
# Test 1: Type into regular input (baseline)
print("\n--- Test 1: Regular input ---")
await bridge.click(tab_id, "#input1")
await bridge.type_text(tab_id, "#input1", "Hello input")
input_result = await bridge.evaluate(
tab_id, "(function() { return document.getElementById('input1').value; })()"
)
print(f"Input value: {input_result.get('result', '')}")
# Test 2: Type into contenteditable div
print("\n--- Test 2: Contenteditable div ---")
await bridge.click(tab_id, "#editor1")
await bridge.type_text(tab_id, "#editor1", "Hello contenteditable", clear_first=True)
editor_result = await bridge.evaluate(
tab_id,
"(function() { return document.getElementById('editor1').innerText; })()",
)
print(f"Editor1 innerText: {editor_result.get('result', '')}")
# Test 3: Use JavaScript insertText for rich editor
print("\n--- Test 3: JavaScript insertText for rich editor ---")
insert_result = await bridge.evaluate(
tab_id,
"""
(function() {
const editor = document.getElementById('editor2');
editor.focus();
document.execCommand('selectAll', false, null);
document.execCommand('insertText', false, 'Hello from execCommand');
return editor.innerText;
})();
""",
)
print(f"Editor2 after execCommand: {insert_result.get('result', '')}")
# Screenshot after with timeout protection
try:
screenshot_after = await asyncio.wait_for(bridge.screenshot(tab_id), timeout=10.0)
print(f"Screenshot after: {len(screenshot_after.get('data', ''))} bytes")
except asyncio.TimeoutError:
print("Screenshot after timed out (skipping)")
# Results
print("\n--- Results ---")
input_val = input_result.get("result", "")
editor1_val = editor_result.get("result", "")
editor2_val = insert_result.get("result", "")
input_pass = "Hello input" in input_val
editor1_pass = "Hello contenteditable" in editor1_val
editor2_pass = "execCommand" in editor2_val
print(f"Input: {'✓ PASS' if input_pass else '✗ FAIL'} - {input_val}")
print(f"Editor1: {'✓ PASS' if editor1_pass else '✗ FAIL'} - {editor1_val}")
print(f"Editor2: {'✓ PASS' if editor2_pass else '✗ FAIL'} - {editor2_val}")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
if __name__ == "__main__":
asyncio.run(test_contenteditable())
@@ -0,0 +1,253 @@
#!/usr/bin/env python
"""
Test #8: Autocomplete Field Clearing
Symptom: Typed text gets cleared immediately
Root Cause: Field expects realistic keystroke timing for autocomplete
Detection: Field has autocomplete listeners or dropdown appears
Fix: Add delay_ms between keystrokes
"""
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
CONTEXT_NAME = "autocomplete-test"
async def test_autocomplete():
"""Test typing into fields with autocomplete behavior."""
print("=" * 70)
print("TEST #8: Autocomplete Field Clearing")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
else:
print("✗ Extension not connected")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Create test page with autocomplete behavior
test_html = """
<!DOCTYPE html>
<html>
<head><title>Autocomplete Test</title>
<style>
.autocomplete-items {
position: absolute;
border: 1px solid #d4d4d4;
border-top: none;
z-index: 99;
top: 100%;
left: 0;
right: 0;
max-height: 200px;
overflow-y: auto;
background: white;
}
.autocomplete-items div {
padding: 10px;
cursor: pointer;
}
.autocomplete-items div:hover {
background-color: #e9e9e9;
}
.autocomplete-active {
background-color: DodgerBlue !important;
color: white;
}
.autocomplete { position: relative; display: inline-block; }
input { width: 300px; padding: 10px; font-size: 16px; }
</style></head>
<body>
<h2>Autocomplete Test</h2>
<div class="autocomplete">
<input id="search" type="text" placeholder="Search countries..." autocomplete="off">
</div>
<div id="log" style="margin-top:20px;font-family:monospace;"></div>
<script>
const countries = [
"Afghanistan","Albania","Algeria",
"Andorra","Angola","Argentina",
"Armenia","Australia","Austria",
"Azerbaijan","Bahamas","Bahrain",
"Bangladesh","Belarus","Belgium",
"Belize","Benin","Bhutan",
"Bolivia","Brazil","Canada",
"China","Colombia","Denmark",
"Egypt","France","Germany",
"India","Indonesia","Italy",
"Japan","Mexico","Netherlands",
"Nigeria","Norway","Pakistan",
"Peru","Philippines","Poland",
"Portugal","Russia","Spain",
"Sweden","Switzerland","Thailand",
"Turkey","Ukraine",
"United Kingdom","United States",
"Vietnam"
];
const input = document.getElementById('search');
const log = document.getElementById('log');
let currentFocus = -1;
let typingTimeout = null;
// Track events for testing
window.inputEvents = [];
window.inputValue = '';
function logEvent(type, value) {
window.inputEvents.push({ type, value, time: Date.now() });
const entry = document.createElement('div');
entry.textContent = type + ': ' + value;
log.insertBefore(entry, log.firstChild);
}
// Simulate autocomplete that clears fast typing
input.addEventListener('input', function(e) {
const val = this.value;
// Clear previous dropdown
closeAllLists();
if (!val) return;
// Debounce: fast keystrokes keep resetting this timer, so only the settled value is logged and gets a dropdown
clearTimeout(typingTimeout);
typingTimeout = setTimeout(() => {
logEvent('input', val);
window.inputValue = val;
// Create dropdown
const div = document.createElement('div');
div.setAttribute('id', this.id + 'autocomplete-list');
div.setAttribute('class', 'autocomplete-items');
this.parentNode.appendChild(div);
countries.filter(
c => c.substr(0, val.length).toUpperCase()
=== val.toUpperCase()
).slice(0, 5).forEach(country => {
const item = document.createElement('div');
item.innerHTML = '<strong>'
+ country.substr(0, val.length)
+ '</strong>'
+ country.substr(val.length);
item.addEventListener('click', function() {
input.value = country;
closeAllLists();
logEvent('select', country);
window.inputValue = country;
});
div.appendChild(item);
});
}, 100); // 100ms debounce
});
function closeAllLists() {
document.querySelectorAll('.autocomplete-items').forEach(el => el.remove());
}
document.addEventListener('click', function() {
closeAllLists();
});
</script>
</body>
</html>
"""
# Write to file and use file:// URL (data: URLs don't work well with extension)
test_file = Path("/tmp/autocomplete_test.html")
test_file.write_text(test_html.strip())
file_url = f"file://{test_file}"
await bridge.navigate(tab_id, file_url, wait_until="load")
print("✓ Page loaded")
# Screenshot
screenshot = await bridge.screenshot(tab_id)
print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
# Test 1: Fast typing (no delay) - may fail
print("\n--- Test 1: Fast typing (delay_ms=0) ---")
await bridge.click(tab_id, "#search")
await bridge.type_text(tab_id, "#search", "Ger", clear_first=True, delay_ms=0)
await asyncio.sleep(0.5)
fast_result = await bridge.evaluate(
tab_id, "(function() { return document.getElementById('search').value; })()"
)
fast_value = fast_result.get("result", "")
print(f"Value after fast typing: '{fast_value}'")
# Check events
events_result = await bridge.evaluate(
tab_id, "(function() { return window.inputEvents; })()"
)
print(f"Events logged: {events_result.get('result', [])}")
# Test 2: Slow typing (with delay) - should work
print("\n--- Test 2: Slow typing (delay_ms=100) ---")
await bridge.click(tab_id, "#search")
await bridge.type_text(tab_id, "#search", "United", clear_first=True, delay_ms=100)
await asyncio.sleep(0.5)
slow_result = await bridge.evaluate(
tab_id, "(function() { return document.getElementById('search').value; })()"
)
slow_value = slow_result.get("result", "")
print(f"Value after slow typing: '{slow_value}'")
# Check if dropdown appeared
dropdown_result = await bridge.evaluate(
tab_id,
"(function() { return document.querySelectorAll("
"'.autocomplete-items div').length; })()",
)
dropdown_count = dropdown_result.get("result", 0)
print(f"Dropdown items: {dropdown_count}")
# Screenshot with dropdown
screenshot_dropdown = await bridge.screenshot(tab_id)
print(f"Screenshot with dropdown: {len(screenshot_dropdown.get('data', ''))} bytes")
# Results
print("\n--- Results ---")
if "United" in slow_value:
print("✓ PASS: Slow typing with delay_ms worked")
else:
print("✗ FAIL: Slow typing still didn't work")
if dropdown_count > 0:
print("✓ PASS: Autocomplete dropdown appeared")
else:
print("⚠ WARNING: No autocomplete dropdown")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
if __name__ == "__main__":
asyncio.run(test_autocomplete())
@@ -0,0 +1,162 @@
#!/usr/bin/env python
"""
Test #10: LinkedIn Huge DOM Tree
Symptom: browser_snapshot() hangs forever
Root Cause: 10k+ DOM nodes, accessibility tree has 50k+ nodes
Detection: document.querySelectorAll('*').length > 5000
Fix: Add timeout (10s default), truncate tree at 2000 nodes
"""
import asyncio
import sys
import time
import base64
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
CONTEXT_NAME = "huge-dom-test"
async def test_huge_dom():
"""Test snapshot performance on huge DOM trees."""
print("=" * 70)
print("TEST #10: Huge DOM Tree (LinkedIn-style)")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
else:
print("✗ Extension not connected")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Test 1: Small DOM (baseline)
print("\n--- Test 1: Small DOM (baseline) ---")
small_html = """
<!DOCTYPE html>
<html><body>
<h1>Small Page</h1>
<p>A few elements</p>
<button>Click me</button>
</body></html>
"""
data_url = f"data:text/html;base64,{base64.b64encode(small_html.encode()).decode()}"
await bridge.navigate(tab_id, data_url, wait_until="load")
start = time.perf_counter()
snapshot = await bridge.snapshot(tab_id, timeout_s=5.0)
elapsed = time.perf_counter() - start
tree_len = len(snapshot.get("tree", ""))
print(f"Small DOM snapshot: {elapsed:.3f}s, {tree_len} chars")
# Test 2: Generate huge DOM
print("\n--- Test 2: Huge DOM (5000+ elements) ---")
huge_html = """
<!DOCTYPE html>
<html><body>
<h1>Huge DOM Test</h1>
<div id="container"></div>
<script>
const container = document.getElementById('container');
for (let i = 0; i < 5000; i++) {
const div = document.createElement('div');
div.className = 'item-' + i;
div.innerHTML = '<span>Item ' + i + '</span><button>Action</button>';
container.appendChild(div);
}
</script>
</body></html>
"""
data_url = f"data:text/html;base64,{base64.b64encode(huge_html.encode()).decode()}"
await bridge.navigate(tab_id, data_url, wait_until="load")
# Count elements
count_result = await bridge.evaluate(
tab_id, "(function() { return document.querySelectorAll('*').length; })()"
)
elem_count = count_result.get("result", 0)
print(f"DOM elements: {elem_count}")
# Skip screenshot on huge DOM - it can timeout
# Instead verify page loaded by checking DOM
print("✓ Page verified (skipping screenshot on huge DOM)")
# Test snapshot with timeout
print("\n--- Testing snapshot with 10s timeout ---")
start = time.perf_counter()
try:
snapshot = await bridge.snapshot(tab_id, timeout_s=10.0)
elapsed = time.perf_counter() - start
tree_len = len(snapshot.get("tree", ""))
truncated = "(truncated)" in snapshot.get("tree", "")
print(f"✓ Huge DOM snapshot: {elapsed:.3f}s, {tree_len} chars, truncated={truncated}")
if elapsed < 5.0:
print("✓ PASS: Snapshot completed quickly")
else:
print(f"⚠ WARNING: Snapshot took {elapsed:.1f}s")
if truncated:
print("✓ PASS: Tree was truncated to prevent hang")
else:
print("⚠ WARNING: Tree not truncated (may need adjustment)")
except asyncio.TimeoutError:
print("✗ FAIL: Snapshot timed out (this shouldn't happen)")
# Test 3: Real LinkedIn
print("\n--- Test 3: Real LinkedIn Feed ---")
await bridge.navigate(
tab_id, "https://www.linkedin.com/feed", wait_until="load", timeout_ms=30000
)
await asyncio.sleep(2)
count_result = await bridge.evaluate(
tab_id, "(function() { return document.querySelectorAll('*').length; })()"
)
elem_count = count_result.get("result", 0)
print(f"LinkedIn DOM elements: {elem_count}")
start = time.perf_counter()
try:
snapshot = await bridge.snapshot(tab_id, timeout_s=15.0)
elapsed = time.perf_counter() - start
tree_len = len(snapshot.get("tree", ""))
truncated = "(truncated)" in snapshot.get("tree", "")
print(f"LinkedIn snapshot: {elapsed:.3f}s, {tree_len} chars, truncated={truncated}")
if elapsed < 5.0:
print("✓ PASS: LinkedIn snapshot fast enough")
elif elapsed < 15.0:
print("⚠ WARNING: LinkedIn snapshot slow but within timeout")
else:
print("✗ FAIL: LinkedIn snapshot too slow")
except asyncio.TimeoutError:
print("✗ FAIL: LinkedIn snapshot timed out")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
if __name__ == "__main__":
asyncio.run(test_huge_dom())
@@ -0,0 +1,190 @@
#!/usr/bin/env python
"""
Test #13: SPA Navigation Events
Symptom: wait_until="load" fires before content ready
Root Cause: SPA uses client-side routing, no full page load
Detection: URL changes but load event already fired
Fix: Use wait_until="networkidle" or wait_for_selector
"""
import asyncio
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
CONTEXT_NAME = "spa-nav-test"
async def test_spa_navigation():
"""Test navigation timing on SPA pages."""
print("=" * 70)
print("TEST #13: SPA Navigation Events")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
else:
print("✗ Extension not connected")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Create a test SPA
spa_html = """
<!DOCTYPE html>
<html>
<head>
<title>SPA Test</title>
<style>
nav a { margin-right: 10px; }
.page { padding: 20px; border: 1px solid #ccc; margin-top: 10px; }
</style>
</head>
<body>
<nav>
<a href="#home" onclick="navigate('home')">Home</a>
<a href="#about" onclick="navigate('about')">About</a>
<a href="#contact" onclick="navigate('contact')">Contact</a>
</nav>
<div id="app" class="page">
<h1>Loading...</h1>
</div>
<script>
// Simulate SPA routing
let currentPage = '';
async function navigate(page) {
if (typeof event !== 'undefined' && event) event.preventDefault(); // guard: the initial load calls navigate() outside a click handler
currentPage = page;
// Show loading state
document.getElementById('app').innerHTML = '<h1>Loading...</h1>';
// Simulate async content loading (like real SPAs)
await new Promise(r => setTimeout(r, 500));
// Render content
const content = {
home: '<h1>Home Page</h1><p>Welcome!</p>'
+ '<button id="home-btn">Home Action</button>',
about: '<h1>About Page</h1><p>Simulated SPA.</p>'
+ '<button id="about-btn">About Action</button>',
contact: '<h1>Contact Page</h1>'
+ '<p>Contact us at test@example.com</p>'
+ '<button id="contact-btn">Contact Action</button>'
};
document.getElementById('app').innerHTML = content[page] || '<h1>404</h1>';
window.location.hash = page;
}
// Initial load with delay (simulates SPA hydration)
setTimeout(() => {
navigate('home');
}, 1000);
// Track for testing
window.pageLoads = [];
window.addEventListener('hashchange', () => {
window.pageLoads.push(window.location.hash);
});
</script>
</body>
</html>
"""
# Write to file and use file:// URL (data: URLs don't work well with extension)
test_file = Path("/tmp/spa_test.html")
test_file.write_text(spa_html.strip())
file_url = f"file://{test_file}"
# Test 1: wait_until="load" - may fire before content ready
print("\n--- Test 1: wait_until='load' ---")
start = time.perf_counter()
await bridge.navigate(tab_id, file_url, wait_until="load")
elapsed = time.perf_counter() - start
print(f"Navigation completed in {elapsed:.3f}s")
# Check content immediately
content = await bridge.evaluate(
tab_id,
"(function() { return document.getElementById('app').innerText; })()",
)
print(f"Content immediately after load: '{content.get('result', '')}'")
# Screenshot
screenshot = await bridge.screenshot(tab_id)
print(f"Screenshot: {len(screenshot.get('data', ''))} bytes")
# Wait for content
print("\n--- Waiting for content to hydrate ---")
await bridge.wait_for_selector(tab_id, "#home-btn", timeout_ms=5000)
print("✓ Content loaded")
# Check content after wait
content_after = await bridge.evaluate(
tab_id,
"(function() { return document.getElementById('app').innerText; })()",
)
print(f"Content after wait: '{content_after.get('result', '')}'")
# Test 2: SPA navigation (no full page load)
print("\n--- Test 2: SPA client-side navigation ---")
# Click "About" link
await bridge.click(tab_id, 'a[href="#about"]')
await asyncio.sleep(1)
# Check if content changed
about_content = await bridge.evaluate(
tab_id,
"(function() { return document.getElementById('app').innerText; })()",
)
print(f"Content after SPA nav: '{about_content.get('result', '')}'")
if "About Page" in about_content.get("result", ""):
print("✓ PASS: SPA navigation worked")
else:
print("✗ FAIL: SPA navigation didn't update content")
# Test 3: wait_until="networkidle"
print("\n--- Test 3: wait_until='networkidle' ---")
await bridge.navigate(tab_id, file_url, wait_until="networkidle", timeout_ms=10000)
# Check content immediately
content_networkidle = await bridge.evaluate(
tab_id,
"(function() { return document.getElementById('app').innerText; })()",
)
print(f"Content after networkidle: '{content_networkidle.get('result', '')}'")
if "Home Page" in content_networkidle.get("result", ""):
print("✓ PASS: networkidle waited for content")
else:
print("⚠ WARNING: networkidle didn't wait long enough")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
if __name__ == "__main__":
asyncio.run(test_spa_navigation())
@@ -0,0 +1,267 @@
#!/usr/bin/env python
"""
Test #15: Screenshot Functionality
Tests browser_screenshot across multiple scenarios:
- Basic viewport screenshot
- Full-page screenshot
- Selector-based screenshot
- Screenshot on complex DOM
- Timeout handling
Category: screenshot
"""
import asyncio
import base64
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
CONTEXT_NAME = "screenshot-test"
SIMPLE_HTML = """<!DOCTYPE html>
<html>
<head><style>
body { margin: 0; background: #fff; font-family: sans-serif; }
h1 { color: #333; padding: 20px; }
.box { width: 200px; height: 100px; background: #4a90e2; margin: 20px; }
.long-content { height: 2000px; background: linear-gradient(blue, red); }
</style></head>
<body>
<h1 id="title">Screenshot Test Page</h1>
<div class="box" id="target-box">Target Box</div>
<div class="long-content"></div>
</body>
</html>"""
def check_png(data: str) -> bool:
"""Verify that base64 data decodes to a valid PNG."""
try:
raw = base64.b64decode(data)
return raw[:8] == b"\x89PNG\r\n\x1a\n"
except Exception:
return False
async def test_basic_screenshot(bridge: BeelineBridge, tab_id: int, data_url: str):
print("\n--- Test 1: Basic Viewport Screenshot ---")
await bridge.navigate(tab_id, data_url, wait_until="load")
await asyncio.sleep(0.5)
start = time.perf_counter()
result = await bridge.screenshot(tab_id)
elapsed = time.perf_counter() - start
ok = result.get("ok")
data = result.get("data", "")
mime = result.get("mimeType", "")
print(f" ok={ok}, mimeType={mime}, elapsed={elapsed:.3f}s")
print(f" data length: {len(data)} chars")
if ok and data:
valid_png = check_png(data)
print(f" valid PNG: {valid_png}")
if valid_png:
raw = base64.b64decode(data)
print(f" PNG size: {len(raw)} bytes")
print(" ✓ PASS: Basic screenshot works")
return True
else:
print(" ✗ FAIL: Data is not a valid PNG")
else:
print(f" ✗ FAIL: {result.get('error', 'no data')}")
return False
async def test_full_page_screenshot(bridge: BeelineBridge, tab_id: int, data_url: str):
print("\n--- Test 2: Full Page Screenshot ---")
await bridge.navigate(tab_id, data_url, wait_until="load")
await asyncio.sleep(0.5)
viewport_result = await bridge.screenshot(tab_id, full_page=False)
full_result = await bridge.screenshot(tab_id, full_page=True)
v_data = viewport_result.get("data", "")
f_data = full_result.get("data", "")
if not v_data or not f_data:
print(f" ✗ FAIL: viewport ok={viewport_result.get('ok')}, full ok={full_result.get('ok')}")
return False
v_size = len(base64.b64decode(v_data))
f_size = len(base64.b64decode(f_data))
print(f" Viewport PNG: {v_size} bytes")
print(f" Full page PNG: {f_size} bytes")
if f_size > v_size:
print(" ✓ PASS: Full page larger than viewport")
return True
else:
print(" ✗ FAIL: Full page not larger than viewport (may not capture long pages)")
return False
async def test_selector_screenshot(bridge: BeelineBridge, tab_id: int, data_url: str):
print("\n--- Test 3: Selector Screenshot ---")
await bridge.navigate(tab_id, data_url, wait_until="load")
await asyncio.sleep(0.5)
# selector param exists in signature but may not be implemented
result = await bridge.screenshot(tab_id, selector="#target-box")
ok = result.get("ok")
data = result.get("data", "")
if ok and data:
# If implemented, the box screenshot should be smaller than a full viewport screenshot
full_result = await bridge.screenshot(tab_id)
full_data = full_result.get("data", "")
if full_data:
sel_size = len(base64.b64decode(data))
full_size = len(base64.b64decode(full_data))
print(f" Selector PNG: {sel_size} bytes")
print(f" Full page PNG: {full_size} bytes")
if sel_size < full_size:
print(" ✓ PASS: Selector screenshot smaller than full page")
return True
else:
print(" ⚠ WARNING: Selector screenshot not smaller (may be full page)")
return False
else:
print(
" ⚠ NOT IMPLEMENTED: selector param ignored"
f" (returns full page) - error={result.get('error')}"
)
print(" NOTE: selector parameter exists in signature but is not used in implementation")
return False
async def test_screenshot_url_metadata(bridge: BeelineBridge, tab_id: int):
print("\n--- Test 4: Screenshot URL Metadata ---")
await bridge.navigate(tab_id, "https://example.com", wait_until="load")
await asyncio.sleep(1)
result = await bridge.screenshot(tab_id)
url = result.get("url", "")
tab = result.get("tabId")
print(f" url={url!r}, tabId={tab}")
if "example.com" in url:
print(" ✓ PASS: URL metadata captured correctly")
return True
else:
print(f" ✗ FAIL: Expected example.com in URL, got {url!r}")
return False
async def test_screenshot_timeout(bridge: BeelineBridge, tab_id: int, data_url: str):
print("\n--- Test 5: Timeout Handling ---")
await bridge.navigate(tab_id, data_url, wait_until="load")
# Very short timeout - likely still completes since simple page
start = time.perf_counter()
result = await bridge.screenshot(tab_id, timeout_s=0.001)
elapsed = time.perf_counter() - start
if not result.get("ok"):
err = result.get("error", "")
if "timed out" in err or "cancelled" in err:
print(f" ✓ PASS: Timeout handled gracefully: {err!r}")
return True
else:
print(f" ⚠ Fast enough to beat timeout: {err!r} in {elapsed:.3f}s")
return True # Not a failure, just fast
else:
print(
f" ⚠ Screenshot completed before timeout ({elapsed:.3f}s) - too fast to test timeout"
)
return True # Still ok, just very fast
async def test_screenshot_complex_site(bridge: BeelineBridge, tab_id: int):
print("\n--- Test 6: Complex Site (example.com) ---")
await bridge.navigate(tab_id, "https://example.com", wait_until="load")
await asyncio.sleep(1)
start = time.perf_counter()
result = await bridge.screenshot(tab_id)
elapsed = time.perf_counter() - start
ok = result.get("ok")
data = result.get("data", "")
print(f" ok={ok}, elapsed={elapsed:.3f}s, data_len={len(data)}")
if ok and check_png(data):
print(" ✓ PASS: Screenshot on real site works")
return True
else:
print(f" ✗ FAIL: {result.get('error', 'bad data')}")
return False
async def main():
print("=" * 70)
print("TEST #15: Screenshot Functionality")
print("=" * 70)
bridge = BeelineBridge()
try:
await bridge.start()
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
print(f"Waiting for extension... ({i + 1}/10)")
else:
print("✗ Extension not connected. Ensure Chrome with Beeline extension is running.")
return
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
data_url = f"data:text/html;base64,{base64.b64encode(SIMPLE_HTML.encode()).decode()}"
results = {
"basic": await test_basic_screenshot(bridge, tab_id, data_url),
"full_page": await test_full_page_screenshot(bridge, tab_id, data_url),
"selector": await test_selector_screenshot(bridge, tab_id, data_url),
"metadata": await test_screenshot_url_metadata(bridge, tab_id),
"timeout": await test_screenshot_timeout(bridge, tab_id, data_url),
"complex_site": await test_screenshot_complex_site(bridge, tab_id),
}
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
for name, passed in results.items():
status = "✓ PASS" if passed else "✗ FAIL"
print(f" {status}: {name}")
passed_count = sum(1 for v in results.values() if v)
total = len(results)
print(f"\n {passed_count}/{total} tests passed")
await bridge.destroy_context(group_id)
print("\n✓ Context destroyed")
finally:
await bridge.stop()
print("✓ Bridge stopped")
if __name__ == "__main__":
asyncio.run(main())
@@ -0,0 +1,333 @@
#!/usr/bin/env python
"""
Browser Edge Case Test Template
This script provides a template for testing and debugging browser tool failures
on specific websites. Use this to reproduce, isolate, and verify fixes.
Usage:
1. Copy this file: cp test_case.py test_[number]_[site].py
2. Fill in the CONFIG section with your test details
3. Run: uv run python test_[number]_[site].py
Example:
uv run python test_01_linkedin_scroll.py
"""
import asyncio
import sys
import time
from pathlib import Path
# Add tools to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "tools" / "src"))
from gcu.browser.bridge import BeelineBridge
# ═══════════════════════════════════════════════════════════════════════════════
# CONFIG: Fill in these values for your test case
# ═══════════════════════════════════════════════════════════════════════════════
TEST_CASE = {
"number": 1,
"name": "LinkedIn Nested Scroll Container",
"site": "https://www.linkedin.com/feed",
"simple_site": "https://example.com",
"category": "scroll", # scroll, click, input, snapshot, navigation
"symptom": "scroll() returns success but page doesn't move",
}
BRIDGE_PORT = 9229
CONTEXT_NAME = "edge-case-test"
# ═══════════════════════════════════════════════════════════════════════════════
# TEST FUNCTIONS
# ═══════════════════════════════════════════════════════════════════════════════
async def test_simple_site(bridge: BeelineBridge, tab_id: int) -> dict:
"""Test that the tool works on a simple site (baseline)."""
print("\n--- Baseline Test (Simple Site) ---")
await bridge.navigate(tab_id, TEST_CASE["simple_site"], wait_until="load")
await asyncio.sleep(1)
# Adjust this based on category
if TEST_CASE["category"] == "scroll":
result = await bridge.scroll(tab_id, "down", 100)
print(f" Scroll result: {result}")
return result
elif TEST_CASE["category"] == "click":
# Add click test
pass
elif TEST_CASE["category"] == "snapshot":
result = await bridge.snapshot(tab_id, timeout_s=5.0)
print(f" Snapshot length: {len(result.get('tree', ''))}")
return result
return {"ok": True}
async def test_problematic_site(bridge: BeelineBridge, tab_id: int) -> dict:
"""Test the tool on the problematic site."""
print("\n--- Problem Site Test ---")
await bridge.navigate(tab_id, TEST_CASE["site"], wait_until="load", timeout_ms=30000)
await asyncio.sleep(2)
# Adjust this based on category
if TEST_CASE["category"] == "scroll":
# Get scroll positions before
before = await bridge.evaluate(
tab_id,
"""
(function() {
const results = { window: { y: window.scrollY } };
document.querySelectorAll('*').forEach((el, i) => {
const style = getComputedStyle(el);
if ((style.overflowY === 'scroll' || style.overflowY === 'auto') &&
el.scrollHeight > el.clientHeight) {
results['el_' + i] = {
tag: el.tagName,
scrollTop: el.scrollTop,
class: (el.getAttribute('class') || '').substring(0, 30)
};
}
});
return results;
})();
""",
)
print(f" Before scroll: {before.get('result', {})}")
# Try to scroll
result = await bridge.scroll(tab_id, "down", 500)
print(f" Scroll result: {result}")
await asyncio.sleep(1)
# Get scroll positions after
after = await bridge.evaluate(
tab_id,
"""
(function() {
const results = { window: { y: window.scrollY } };
document.querySelectorAll('*').forEach((el, i) => {
const style = getComputedStyle(el);
if ((style.overflowY === 'scroll' || style.overflowY === 'auto') &&
el.scrollHeight > el.clientHeight) {
results['el_' + i] = {
tag: el.tagName,
scrollTop: el.scrollTop,
class: (el.getAttribute('class') || '').substring(0, 30)
};
}
});
return results;
})();
""",
)
print(f" After scroll: {after.get('result', {})}")
# Check if anything changed
before_data = before.get("result", {}) or {}
after_data = after.get("result", {}) or {}
changed = False
for key in after_data:
if key in before_data:
b_val = (
before_data[key].get("scrollTop", 0)
if isinstance(before_data[key], dict)
else 0
)
a_val = (
after_data[key].get("scrollTop", 0) if isinstance(after_data[key], dict) else 0
)
if a_val != b_val:
print(f" ✓ CHANGE DETECTED: {key} scrolled from {b_val} to {a_val}")
changed = True
if not changed:
print(" ✗ NO CHANGE: Scroll did not affect any container")
return {"ok": changed, "scroll_result": result}
elif TEST_CASE["category"] == "snapshot":
start = time.perf_counter()
try:
result = await bridge.snapshot(tab_id, timeout_s=15.0)
elapsed = time.perf_counter() - start
tree_len = len(result.get("tree", ""))
print(f" Snapshot completed in {elapsed:.2f}s, {tree_len} chars")
return {"ok": True, "elapsed": elapsed, "tree_length": tree_len}
except asyncio.TimeoutError:
print(" ✗ SNAPSHOT TIMED OUT")
return {"ok": False, "error": "timeout"}
return {"ok": True}
async def detect_root_cause(bridge: BeelineBridge, tab_id: int) -> dict:
"""Run detection scripts to identify the root cause."""
print("\n--- Root Cause Detection ---")
detections = {}
# Detection 1: Nested scrollable containers
scroll_check = await bridge.evaluate(
tab_id,
"""
(function() {
const candidates = [];
document.querySelectorAll('*').forEach(el => {
const style = getComputedStyle(el);
if (style.overflow.includes('scroll') || style.overflow.includes('auto')) {
const rect = el.getBoundingClientRect();
if (rect.width > 100 && rect.height > 100) {
candidates.push({
tag: el.tagName,
area: rect.width * rect.height,
class: (el.getAttribute('class') || '').substring(0, 30)
});
}
}
});
candidates.sort((a, b) => b.area - a.area);
return {
count: candidates.length,
largest: candidates[0]
};
})();
""",
)
detections["nested_scroll"] = scroll_check.get("result", {})
print(f" Nested scroll containers: {detections['nested_scroll']}")
# Detection 2: Shadow DOM
shadow_check = await bridge.evaluate(
tab_id,
"""
(function() {
const withShadow = [];
document.querySelectorAll('*').forEach(el => {
if (el.shadowRoot) {
withShadow.push(el.tagName);
}
});
return { count: withShadow.length, elements: withShadow.slice(0, 5) };
})();
""",
)
detections["shadow_dom"] = shadow_check.get("result", {})
print(f" Shadow DOM: {detections['shadow_dom']}")
# Detection 3: iframes
iframe_check = await bridge.evaluate(
tab_id,
"""
(function() {
const iframes = document.querySelectorAll('iframe');
return { count: iframes.length };
})();
""",
)
detections["iframes"] = iframe_check.get("result", {})
print(f" iframes: {detections['iframes']}")
# Detection 4: DOM size
dom_check = await bridge.evaluate(
tab_id,
"""
(function() {
return {
elements: document.querySelectorAll('*').length,
body_children: document.body.children.length
};
})();
""",
)
detections["dom_size"] = dom_check.get("result", {})
print(f" DOM size: {detections['dom_size']}")
# Detection 5: Framework detection
framework_check = await bridge.evaluate(
tab_id,
"""
(function() {
return {
react: !!document.querySelector('[data-reactroot], [data-reactid]'),
vue: !!(window.Vue || document.querySelector('[data-v-app]')), // data-v-app is added by Vue 3 mount; scoped data-v-* attrs can't be prefix-matched with a selector
angular: !!document.querySelector('[ng-app], [ng-version]')
};
})();
""",
)
detections["frameworks"] = framework_check.get("result", {})
print(f" Frameworks: {detections['frameworks']}")
return detections
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════════════
async def main():
print("=" * 70)
print(f"EDGE CASE TEST #{TEST_CASE['number']}: {TEST_CASE['name']}")
print("=" * 70)
print(f"Site: {TEST_CASE['site']}")
print(f"Category: {TEST_CASE['category']}")
print(f"Symptom: {TEST_CASE['symptom']}")
bridge = BeelineBridge()
try:
print("\n--- Starting Bridge ---")
await bridge.start()
# Wait for extension connection
for i in range(10):
await asyncio.sleep(1)
if bridge.is_connected:
print("✓ Extension connected!")
break
print(f"Waiting for extension... ({i + 1}/10)")
else:
print("✗ Extension not connected. Ensure Chrome with Beeline extension is running.")
return
# Create browser context
context = await bridge.create_context(CONTEXT_NAME)
tab_id = context.get("tabId")
group_id = context.get("groupId")
print(f"✓ Created tab: {tab_id}")
# Run tests
baseline_result = await test_simple_site(bridge, tab_id)
problem_result = await test_problematic_site(bridge, tab_id)
detections = await detect_root_cause(bridge, tab_id)
# Summary
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"Baseline test: {'✓ PASS' if baseline_result.get('ok') else '✗ FAIL'}")
print(f"Problem test: {'✓ PASS' if problem_result.get('ok') else '✗ FAIL'}")
print(f"Root cause indicators: {list(k for k, v in detections.items() if v)}")
# Cleanup
print("\n--- Cleanup ---")
await bridge.destroy_context(group_id)
print("✓ Context destroyed")
finally:
await bridge.stop()
print("✓ Bridge stopped")
if __name__ == "__main__":
asyncio.run(main())
@@ -1,399 +0,0 @@
---
name: hive-concepts
description: Core concepts for goal-driven agents - architecture, node types (event_loop, function), tool discovery, and workflow overview. Use when starting agent development or need to understand agent fundamentals.
license: Apache-2.0
metadata:
author: hive
version: "2.0"
type: foundational
part_of: hive
---
# Building Agents - Core Concepts
Foundational knowledge for building goal-driven agents as Python packages.
## Architecture: Python Services (Not JSON Configs)
Agents are built as Python packages:
```
exports/my_agent/
├── __init__.py # Package exports
├── __main__.py # CLI (run, info, validate, shell)
├── agent.py # Graph construction (goal, edges, agent class)
├── nodes/__init__.py # Node definitions (NodeSpec)
├── config.py # Runtime config
└── README.md # Documentation
```
**Key Principle: Agent is visible and editable during build**
- Files created immediately as components are approved
- User can watch files grow in their editor
- No session state - just direct file writes
- No "export" step - agent is ready when build completes
## Core Concepts
### Goal
Success criteria and constraints (written to agent.py)
```python
goal = Goal(
id="research-goal",
name="Technical Research Agent",
description="Research technical topics thoroughly",
success_criteria=[
SuccessCriterion(
id="completeness",
description="Cover all aspects of topic",
metric="coverage_score",
target=">=0.9",
weight=0.4,
),
# 3-5 success criteria total
],
constraints=[
Constraint(
id="accuracy",
description="All information must be verified",
constraint_type="hard",
category="quality",
),
# 1-5 constraints total
],
)
```
### Node
Unit of work (written to nodes/__init__.py)
**Node Types:**
- `event_loop` — Multi-turn streaming loop with tool execution and judge-based evaluation. Works with or without tools.
- `function` — Deterministic Python operations. No LLM involved.
```python
search_node = NodeSpec(
id="search-web",
name="Search Web",
description="Search for information and extract results",
node_type="event_loop",
input_keys=["query"],
output_keys=["search_results"],
system_prompt="Search the web for: {query}. Use the web_search tool to find results, then call set_output to store them.",
tools=["web_search"],
)
```
**NodeSpec Fields for Event Loop Nodes:**
| Field | Default | Description |
|-------|---------|-------------|
| `client_facing` | `False` | If True, streams output to user and blocks for input between turns |
| `nullable_output_keys` | `[]` | Output keys that may remain unset (for mutually exclusive outputs) |
| `max_node_visits` | `1` | Max times this node executes per run. Set >1 for feedback loop targets |
### Edge
Connection between nodes (written to agent.py)
**Edge Conditions:**
- `on_success` — Proceed if node succeeds (most common)
- `on_failure` — Handle errors
- `always` — Always proceed
- `conditional` — Based on expression evaluating node output
**Edge Priority:**
Priority controls evaluation order when multiple edges leave the same node. Higher priority edges are evaluated first. Use negative priority for feedback edges (edges that loop back to earlier nodes).
```python
# Forward edge (evaluated first)
EdgeSpec(
id="review-to-campaign",
source="review",
target="campaign-builder",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved_contacts') is not None",
priority=1,
)
# Feedback edge (evaluated after forward edges)
EdgeSpec(
id="review-feedback",
source="review",
target="extractor",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('redo_extraction') is not None",
priority=-1,
)
```
### Client-Facing Nodes
For multi-turn conversations with the user, set `client_facing=True` on a node. The node will:
- Stream its LLM output directly to the end user
- Block for user input between conversational turns
- Resume when new input is injected via `inject_event()`
```python
intake_node = NodeSpec(
id="intake",
name="Intake",
description="Gather requirements from the user",
node_type="event_loop",
client_facing=True,
input_keys=[],
output_keys=["repo_url", "project_url"],
system_prompt="You are the intake agent. Ask the user for the repo URL and project URL.",
)
```
> **Legacy Note:** The old `pause_nodes` / `entry_points` pattern still works but `client_facing=True` is preferred for new agents.
**STEP 1 / STEP 2 Prompt Pattern:** For client-facing nodes, structure the system prompt with two explicit phases:
```python
system_prompt="""\
**STEP 1 — Respond to the user (text only, NO tool calls):**
[Present information, ask questions, etc.]
**STEP 2 — After the user responds, call set_output:**
[Call set_output with the structured outputs]
"""
```
This prevents the LLM from calling `set_output` prematurely before the user has had a chance to respond.
### Node Design: Fewer, Richer Nodes
Prefer fewer nodes that do more work over many thin single-purpose nodes:
- **Bad**: 8 thin nodes (parse query → search → fetch → evaluate → synthesize → write → check → save)
- **Good**: 4 rich nodes (intake → research → review → report)
Why: Each node boundary requires serializing outputs and passing context. Fewer nodes means the LLM retains full context of its work within the node. A research node that searches, fetches, and analyzes keeps all the source material in its conversation history.
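For instance, a single rich research node can own the whole search-fetch-analyze cycle (a sketch using the NodeSpec fields above; the tool names are illustrative and must still be verified with `list_mcp_tools()`):

```python
# One rich node instead of separate search / fetch / analyze nodes.
# Tool names are illustrative - verify them via list_mcp_tools() first.
research_node = NodeSpec(
    id="research",
    name="Research Topic",
    description="Search, fetch, and analyze sources for the research brief",
    node_type="event_loop",
    input_keys=["research_brief"],
    output_keys=["findings", "sources"],
    system_prompt=(
        "Research: {research_brief}. Use web_search to find sources, "
        "web_scrape to read the promising ones, then call set_output "
        "with 'findings' and 'sources'."
    ),
    tools=["web_search", "web_scrape", "save_data", "load_data"],
)
```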
### nullable_output_keys for Cross-Edge Inputs
When a node receives inputs that only arrive on certain edges (e.g., `feedback` only comes from a review → research feedback loop, not from intake → research), mark those keys as `nullable_output_keys`:
```python
research_node = NodeSpec(
id="research",
input_keys=["research_brief", "feedback"],
nullable_output_keys=["feedback"], # Not present on first visit
max_node_visits=3,
...
)
```
## Event Loop Architecture Concepts
### How EventLoopNode Works
An event loop node runs a multi-turn loop:
1. LLM receives system prompt + conversation history
2. LLM responds (text and/or tool calls)
3. Tool calls are executed, results added to conversation
4. Judge evaluates: ACCEPT (exit loop), RETRY (loop again), or ESCALATE
5. Repeat until judge ACCEPTs or max_iterations reached
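Schematically, the control flow looks like this (a simplified sketch of the loop, not the framework's actual implementation; the message and verdict shapes are assumptions):

```python
# Simplified sketch of the event_loop control flow - not the real framework code.
async def event_loop_sketch(llm, execute_tool, judge, system_prompt, max_iterations=50):
    history = [{"role": "system", "content": system_prompt}]
    for _ in range(max_iterations):
        response = await llm(history)                    # 1-2: prompt -> text and/or tool calls
        history.append({"role": "assistant", "content": response["text"]})
        for call in response.get("tool_calls", []):      # 3: execute tools, feed results back
            result = await execute_tool(call)
            history.append({"role": "tool", "content": result})
        verdict = judge(history)                         # 4: ACCEPT / RETRY / ESCALATE
        if verdict == "ACCEPT":
            return history                               # exit loop with accumulated outputs
        if verdict == "ESCALATE":
            raise RuntimeError("judge escalated")
    raise RuntimeError("max_iterations reached")
```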
### EventLoopNode Runtime
EventLoopNodes are **auto-created** by `GraphExecutor` at runtime. You do NOT need to manually register them. Both `GraphExecutor` (direct) and `AgentRuntime` / `create_agent_runtime()` handle event_loop nodes automatically.
```python
# Direct execution — executor auto-creates EventLoopNodes
from framework.graph.executor import GraphExecutor
from framework.runtime.core import Runtime
runtime = Runtime(storage_path)
executor = GraphExecutor(
runtime=runtime,
llm=llm,
tools=tools,
tool_executor=tool_executor,
storage_path=storage_path,
)
result = await executor.execute(graph=graph, goal=goal, input_data=input_data)
# TUI execution — AgentRuntime also works
from framework.runtime.agent_runtime import create_agent_runtime
runtime = create_agent_runtime(
graph=graph, goal=goal, storage_path=storage_path,
entry_points=[...], llm=llm, tools=tools, tool_executor=tool_executor,
)
```
### set_output
Nodes produce structured outputs by calling `set_output(key, value)` — a synthetic tool injected by the framework. When the LLM calls `set_output`, the value is stored in the output accumulator and made available to downstream nodes via shared memory.
`set_output` is NOT a real tool — it is excluded from `real_tool_results`. For client-facing nodes, this means a turn where the LLM only calls `set_output` (no other tools) is treated as a conversational boundary and will block for user input.
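For example, wiring an output key from one node into the next (a sketch reusing the NodeSpec fields above):

```python
# The search node calls set_output("search_results", ...); the writer node
# receives that value because "search_results" is listed in its input_keys.
writer_node = NodeSpec(
    id="write-report",
    name="Write Report",
    description="Summarize the search results into a report",
    node_type="event_loop",
    input_keys=["search_results"],
    output_keys=["report"],
    system_prompt="Write a report from: {search_results}. Call set_output('report', ...) when done.",
)
```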
### JudgeProtocol
**The judge is the SOLE mechanism for acceptance decisions.** Do not add ad-hoc framework gating, output rollback, or premature rejection logic. If the LLM calls `set_output` too early, fix it with better prompts or a custom judge — not framework-level guards.
The judge controls when a node's loop exits:
- **Implicit judge** (default, no judge configured): ACCEPTs when the LLM finishes with no tool calls and all required output keys are set
- **SchemaJudge**: Validates outputs against a Pydantic model
- **Custom judges**: Implement `evaluate(context) -> JudgeVerdict`
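A custom judge might look like this (a sketch; only the `evaluate(context) -> JudgeVerdict` protocol is given above, so the `JudgeVerdict` fields and the shape of `context` are assumptions):

```python
# Sketch of a custom judge - JudgeVerdict field names and context.outputs are assumed.
class OutputsSetJudge:
    """ACCEPT once both required output keys are present and non-empty."""

    def evaluate(self, context) -> "JudgeVerdict":
        outputs = context.outputs  # assumed: values accumulated via set_output
        if outputs.get("findings") and outputs.get("sources"):
            return JudgeVerdict(decision="ACCEPT")
        return JudgeVerdict(decision="RETRY", feedback="Set both 'findings' and 'sources' before finishing.")
```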
### LoopConfig
Controls loop behavior:
- `max_iterations` (default 50) — prevents infinite loops
- `max_tool_calls_per_turn` (default 10) — limits tool calls per LLM response
- `tool_call_overflow_margin` (default 0.5) — wiggle room before discarding extra tool calls (50% means hard cutoff at 150% of limit)
- `stall_detection_threshold` (default 3) — detects repeated identical responses
- `max_history_tokens` (default 32000) — triggers conversation compaction
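If you set these in code rather than through the `configure_loop` MCP tool shown later, a minimal sketch would be (assuming `LoopConfig` accepts these field names as keyword arguments):

```python
loop_config = LoopConfig(
    max_iterations=50,
    max_tool_calls_per_turn=10,
    tool_call_overflow_margin=0.5,
    stall_detection_threshold=3,
    max_history_tokens=32000,
)
```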
### Data Tools (Spillover Management)
When tool results exceed the context window, the framework automatically saves them to a spillover directory and truncates with a hint. Nodes that produce or consume large data should include the data tools:
- `save_data(filename, data)` — Write data to a file in the data directory
- `load_data(filename, offset=0, limit=50)` — Read data with line-based pagination
- `list_data_files()` — List available data files
- `serve_file_to_user(filename, label="")` — Get a clickable file:// URI for the user
Note: `data_dir` is a framework-injected context parameter — the LLM never sees or passes it. `GraphExecutor.execute()` sets it per-execution via `contextvars`, so data tools and spillover always share the same session-scoped directory.
These are real MCP tools (not synthetic). Add them to nodes that handle large tool results:
```python
research_node = NodeSpec(
...
tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
)
```
### Fan-Out / Fan-In
Multiple ON_SUCCESS edges from the same source create parallel execution. All branches run concurrently via `asyncio.gather()`. Parallel event_loop nodes must have disjoint `output_keys`.
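For example, two success edges leaving the same research node fan out into parallel analysis nodes, each writing its own disjoint output keys (a sketch; `EdgeCondition.ON_SUCCESS` is assumed to be the enum member for the `on_success` condition listed above):

```python
# Both edges fire on success, so "sentiment" and "keywords" run concurrently.
EdgeSpec(
    id="research-to-sentiment",
    source="research",
    target="sentiment",   # writes output_keys=["sentiment_summary"]
    condition=EdgeCondition.ON_SUCCESS,
)
EdgeSpec(
    id="research-to-keywords",
    source="research",
    target="keywords",    # writes output_keys=["keyword_list"]
    condition=EdgeCondition.ON_SUCCESS,
)
```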
### max_node_visits
Controls how many times a node can execute in one graph run. Default is 1. Set higher for nodes that are targets of feedback edges (review-reject loops). Set 0 for unlimited (guarded by max_steps).
## Tool Discovery & Validation
**CRITICAL:** Before adding a node with tools, you MUST verify the tools exist.
Tools are provided by MCP servers. Never assume a tool exists - always discover dynamically.
### Step 1: Register MCP Server (if not already done)
```python
mcp__agent-builder__add_mcp_server(
name="tools",
transport="stdio",
command="python",
args='["mcp_server.py", "--stdio"]',
cwd="../tools"
)
```
### Step 2: Discover Available Tools
```python
# List all tools from all registered servers
mcp__agent-builder__list_mcp_tools()
# Or list tools from a specific server
mcp__agent-builder__list_mcp_tools(server_name="tools")
```
### Step 3: Validate Before Adding Nodes
Before writing a node with `tools=[...]`:
1. Call `list_mcp_tools()` to get available tools
2. Check each tool in your node exists in the response
3. If a tool doesn't exist:
- **DO NOT proceed** with the node
- Inform the user: "The tool 'X' is not available. Available tools are: ..."
- Ask if they want to use an alternative or proceed without the tool
### Tool Validation Anti-Patterns
- **Never assume a tool exists** - always call `list_mcp_tools()` first
- **Never write a node with unverified tools** - validate before writing
- **Never silently drop tools** - if a tool doesn't exist, inform the user
- **Never guess tool names** - use exact names from discovery response
## Workflow Overview: Incremental File Construction
```
1. CREATE PACKAGE → mkdir + write skeletons
2. DEFINE GOAL → Write to agent.py + config.py
3. FOR EACH NODE:
- Propose design (event_loop for LLM work, function for deterministic)
- User approves
- Write to nodes/__init__.py IMMEDIATELY
- (Optional) Validate with test_node
4. CONNECT EDGES → Update agent.py
- Use priority for feedback edges (negative priority)
- (Optional) Validate with validate_graph
5. FINALIZE → Write agent class to agent.py
6. DONE - Agent ready at exports/my_agent/
```
**Files are written immediately; the MCP tools are optional, for validation and test bookkeeping.**
## When to Use This Skill
Use hive-concepts when:
- Starting a new agent project and need to understand fundamentals
- Need to understand agent architecture before building
- Want to validate tool availability before proceeding
- Learning about node types, edges, and graph execution
**Next Steps:**
- Ready to build? → Use `hive-create` skill
- Need patterns and examples? → Use `hive-patterns` skill
## MCP Tools for Validation
After writing files, optionally use MCP tools for validation:
**test_node** - Validate node configuration with mock inputs
```python
mcp__agent-builder__test_node(
node_id="search-web",
test_input='{"query": "test query"}',
mock_llm_response='{"results": "mock output"}'
)
```
**validate_graph** - Check graph structure
```python
mcp__agent-builder__validate_graph()
# Returns: unreachable nodes, missing connections, event_loop validation, etc.
```
**configure_loop** - Set event loop parameters
```python
mcp__agent-builder__configure_loop(
max_iterations=50,
max_tool_calls_per_turn=10,
stall_detection_threshold=3,
max_history_tokens=32000
)
```
**Key Point:** Files are written FIRST. MCP tools are for validation only.
## Related Skills
- **hive-create** - Step-by-step building process
- **hive-patterns** - Best practices: judges, feedback edges, fan-out, context management
- **hive** - Complete workflow orchestrator
- **hive-test** - Test and validate completed agents
File diff suppressed because it is too large
@@ -1,24 +0,0 @@
"""
Deep Research Agent - Interactive, rigorous research with TUI conversation.
Research any topic through multi-source web search, quality evaluation,
and synthesis. Features client-facing TUI interaction at key checkpoints
for user guidance and iterative deepening.
"""
from .agent import DeepResearchAgent, default_agent, goal, nodes, edges
from .config import RuntimeConfig, AgentMetadata, default_config, metadata
__version__ = "1.0.0"
__all__ = [
"DeepResearchAgent",
"default_agent",
"goal",
"nodes",
"edges",
"RuntimeConfig",
"AgentMetadata",
"default_config",
"metadata",
]
@@ -1,241 +0,0 @@
"""
CLI entry point for Deep Research Agent.
Uses AgentRuntime for multi-entrypoint support with HITL pause/resume.
"""
import asyncio
import json
import logging
import sys
import click
from .agent import default_agent, DeepResearchAgent
def setup_logging(verbose=False, debug=False):
"""Configure logging for execution visibility."""
if debug:
level, fmt = logging.DEBUG, "%(asctime)s %(name)s: %(message)s"
elif verbose:
level, fmt = logging.INFO, "%(message)s"
else:
level, fmt = logging.WARNING, "%(levelname)s: %(message)s"
logging.basicConfig(level=level, format=fmt, stream=sys.stderr)
logging.getLogger("framework").setLevel(level)
@click.group()
@click.version_option(version="1.0.0")
def cli():
"""Deep Research Agent - Interactive, rigorous research with TUI conversation."""
pass
@cli.command()
@click.option("--topic", "-t", type=str, required=True, help="Research topic")
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--quiet", "-q", is_flag=True, help="Only output result JSON")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def run(topic, mock, quiet, verbose, debug):
"""Execute research on a topic."""
if not quiet:
setup_logging(verbose=verbose, debug=debug)
context = {"topic": topic}
result = asyncio.run(default_agent.run(context, mock_mode=mock))
output_data = {
"success": result.success,
"steps_executed": result.steps_executed,
"output": result.output,
}
if result.error:
output_data["error"] = result.error
click.echo(json.dumps(output_data, indent=2, default=str))
sys.exit(0 if result.success else 1)
@cli.command()
@click.option("--mock", is_flag=True, help="Run in mock mode")
@click.option("--verbose", "-v", is_flag=True, help="Show execution details")
@click.option("--debug", is_flag=True, help="Show debug logging")
def tui(mock, verbose, debug):
"""Launch the TUI dashboard for interactive research."""
setup_logging(verbose=verbose, debug=debug)
try:
from framework.tui.app import AdenTUI
except ImportError:
click.echo(
"TUI requires the 'textual' package. Install with: pip install textual"
)
sys.exit(1)
from pathlib import Path
from framework.llm import LiteLLMProvider
from framework.runner.tool_registry import ToolRegistry
from framework.runtime.agent_runtime import create_agent_runtime
from framework.runtime.event_bus import EventBus
from framework.runtime.execution_stream import EntryPointSpec
async def run_with_tui():
agent = DeepResearchAgent()
# Build graph and tools
agent._event_bus = EventBus()
agent._tool_registry = ToolRegistry()
storage_path = Path.home() / ".hive" / "agents" / "deep_research_agent"
storage_path.mkdir(parents=True, exist_ok=True)
mcp_config_path = Path(__file__).parent / "mcp_servers.json"
if mcp_config_path.exists():
agent._tool_registry.load_mcp_config(mcp_config_path)
llm = None
if not mock:
llm = LiteLLMProvider(
model=agent.config.model,
api_key=agent.config.api_key,
api_base=agent.config.api_base,
)
tools = list(agent._tool_registry.get_tools().values())
tool_executor = agent._tool_registry.get_executor()
graph = agent._build_graph()
runtime = create_agent_runtime(
graph=graph,
goal=agent.goal,
storage_path=storage_path,
entry_points=[
EntryPointSpec(
id="start",
name="Start Research",
entry_node="intake",
trigger_type="manual",
isolation_level="isolated",
),
],
llm=llm,
tools=tools,
tool_executor=tool_executor,
)
await runtime.start()
try:
app = AdenTUI(runtime)
await app.run_async()
finally:
await runtime.stop()
asyncio.run(run_with_tui())
@cli.command()
@click.option("--json", "output_json", is_flag=True)
def info(output_json):
"""Show agent information."""
info_data = default_agent.info()
if output_json:
click.echo(json.dumps(info_data, indent=2))
else:
click.echo(f"Agent: {info_data['name']}")
click.echo(f"Version: {info_data['version']}")
click.echo(f"Description: {info_data['description']}")
click.echo(f"\nNodes: {', '.join(info_data['nodes'])}")
click.echo(f"Client-facing: {', '.join(info_data['client_facing_nodes'])}")
click.echo(f"Entry: {info_data['entry_node']}")
click.echo(f"Terminal: {', '.join(info_data['terminal_nodes'])}")
@cli.command()
def validate():
"""Validate agent structure."""
validation = default_agent.validate()
if validation["valid"]:
click.echo("Agent is valid")
if validation["warnings"]:
for warning in validation["warnings"]:
click.echo(f" WARNING: {warning}")
else:
click.echo("Agent has errors:")
for error in validation["errors"]:
click.echo(f" ERROR: {error}")
sys.exit(0 if validation["valid"] else 1)
@cli.command()
@click.option("--verbose", "-v", is_flag=True)
def shell(verbose):
"""Interactive research session (CLI, no TUI)."""
asyncio.run(_interactive_shell(verbose))
async def _interactive_shell(verbose=False):
"""Async interactive shell."""
setup_logging(verbose=verbose)
click.echo("=== Deep Research Agent ===")
click.echo("Enter a topic to research (or 'quit' to exit):\n")
agent = DeepResearchAgent()
await agent.start()
try:
while True:
try:
topic = await asyncio.get_event_loop().run_in_executor(
None, input, "Topic> "
)
if topic.lower() in ["quit", "exit", "q"]:
click.echo("Goodbye!")
break
if not topic.strip():
continue
click.echo("\nResearching...\n")
result = await agent.trigger_and_wait("start", {"topic": topic})
if result is None:
click.echo("\n[Execution timed out]\n")
continue
if result.success:
output = result.output
if "report_content" in output:
click.echo("\n--- Report ---\n")
click.echo(output["report_content"])
click.echo("\n")
if "references" in output:
click.echo("--- References ---\n")
for ref in output.get("references", []):
click.echo(
f" [{ref.get('number', '?')}] {ref.get('title', '')} - {ref.get('url', '')}"
)
click.echo("\n")
else:
click.echo(f"\nResearch failed: {result.error}\n")
except KeyboardInterrupt:
click.echo("\nGoodbye!")
break
except Exception as e:
click.echo(f"Error: {e}", err=True)
import traceback
traceback.print_exc()
finally:
await agent.stop()
if __name__ == "__main__":
cli()
@@ -1,26 +0,0 @@
"""Runtime configuration."""
from dataclasses import dataclass
from framework.config import RuntimeConfig
default_config = RuntimeConfig()
@dataclass
class AgentMetadata:
name: str = "Deep Research Agent"
version: str = "1.0.0"
description: str = (
"Interactive research agent that rigorously investigates topics through "
"multi-source search, quality evaluation, and synthesis - with TUI conversation "
"at key checkpoints for user guidance and feedback."
)
intro_message: str = (
"Hi! I'm your deep research assistant. Tell me a topic and I'll investigate it "
"thoroughly — searching multiple sources, evaluating quality, and synthesizing "
"a comprehensive report. What would you like me to research?"
)
metadata = AgentMetadata()
@@ -1,9 +0,0 @@
{
"hive-tools": {
"transport": "stdio",
"command": "uv",
"args": ["run", "python", "mcp_server.py", "--stdio"],
"cwd": "../../tools",
"description": "Hive tools MCP server providing web_search, web_scrape, and write_to_file"
}
}
@@ -1,213 +0,0 @@
"""Node definitions for Deep Research Agent."""
from framework.graph import NodeSpec
# Node 1: Intake (client-facing)
# Brief conversation to clarify what the user wants researched.
intake_node = NodeSpec(
id="intake",
name="Research Intake",
description="Discuss the research topic with the user, clarify scope, and confirm direction",
node_type="event_loop",
client_facing=True,
max_node_visits=0,
input_keys=["topic"],
output_keys=["research_brief"],
success_criteria=(
"The research brief is specific and actionable: it states the topic, "
"the key questions to answer, the desired scope, and depth."
),
system_prompt="""\
You are a research intake specialist. The user wants to research a topic.
Have a brief conversation to clarify what they need.
**STEP 1 — Read and respond (text only, NO tool calls):**
1. Read the topic provided
2. If it's vague, ask 1-2 clarifying questions (scope, angle, depth)
3. If it's already clear, confirm your understanding and ask the user to confirm
Keep it short. Don't over-ask.
**STEP 2 — After the user confirms, call set_output:**
- set_output("research_brief", "A clear paragraph describing exactly what to research, \
what questions to answer, what scope to cover, and how deep to go.")
""",
tools=[],
)
# Node 2: Research
# The workhorse — searches the web, fetches content, analyzes sources.
# One node with both tools avoids the context-passing overhead of 5 separate nodes.
research_node = NodeSpec(
id="research",
name="Research",
description="Search the web, fetch source content, and compile findings",
node_type="event_loop",
max_node_visits=0,
input_keys=["research_brief", "feedback"],
output_keys=["findings", "sources", "gaps"],
nullable_output_keys=["feedback"],
success_criteria=(
"Findings reference at least 3 distinct sources with URLs. "
"Key claims are substantiated by fetched content, not generated."
),
system_prompt="""\
You are a research agent. Given a research brief, find and analyze sources.
If feedback is provided, this is a follow-up round; focus on the gaps identified.
Work in phases:
1. **Search**: Use web_search with 3-5 diverse queries covering different angles.
Prioritize authoritative sources (.edu, .gov, established publications).
2. **Fetch**: Use web_scrape on the most promising URLs (aim for 5-8 sources).
Skip URLs that fail. Extract the substantive content.
3. **Analyze**: Review what you've collected. Identify key findings, themes,
and any contradictions between sources.
Important:
- Work in batches of 3-4 tool calls at a time, never more than 10 per turn
- After each batch, assess whether you have enough material
- Prefer quality over quantity: 5 good sources beat 15 thin ones
- Track which URL each finding comes from (you'll need citations later)
- Call set_output for each key in a SEPARATE turn (not in the same turn as other tool calls)
Context management:
- Your tool results are automatically saved to files. After compaction, the file \
references remain in the conversation; use load_data() to recover any content you need.
- Use append_data('research_notes.md', ...) to maintain a running log of key findings \
as you go. This survives compaction and helps the report node produce a detailed report.
When done, use set_output (one key at a time, separate turns):
- set_output("findings", "Structured summary: key findings with source URLs for each claim. \
Include themes, contradictions, and confidence levels.")
- set_output("sources", [{"url": "...", "title": "...", "summary": "..."}])
- set_output("gaps", "What aspects of the research brief are NOT well-covered yet, if any.")
""",
tools=[
"web_search",
"web_scrape",
"load_data",
"save_data",
"append_data",
"list_data_files",
],
)
# Node 3: Review (client-facing)
# Shows the user what was found and asks whether to dig deeper or proceed.
review_node = NodeSpec(
id="review",
name="Review Findings",
description="Present findings to user and decide whether to research more or write the report",
node_type="event_loop",
client_facing=True,
max_node_visits=0,
input_keys=["findings", "sources", "gaps", "research_brief"],
output_keys=["needs_more_research", "feedback"],
success_criteria=(
"The user has been presented with findings and has explicitly indicated "
"whether they want more research or are ready for the report."
),
system_prompt="""\
Present the research findings to the user clearly and concisely.
**STEP 1 — Present (your first message, text only, NO tool calls):**
1. **Summary** (2-3 sentences of what was found)
2. **Key Findings** (bulleted, with confidence levels)
3. **Sources Used** (count and quality assessment)
4. **Gaps** (what's still unclear or under-covered)
End by asking: Are they satisfied, or do they want deeper research? \
Should we proceed to writing the final report?
**STEP 2 — After the user responds, call set_output:**
- set_output("needs_more_research", "true") if they want more
- set_output("needs_more_research", "false") if they're satisfied
- set_output("feedback", "What the user wants explored further, or empty string")
""",
tools=[],
)
# Node 4: Report (client-facing)
# Writes an HTML report, serves the link to the user, and answers follow-ups.
report_node = NodeSpec(
id="report",
name="Write & Deliver Report",
description="Write a cited HTML report from the findings and present it to the user",
node_type="event_loop",
client_facing=True,
max_node_visits=0,
input_keys=["findings", "sources", "research_brief"],
output_keys=["delivery_status", "next_action"],
success_criteria=(
"An HTML report has been saved, the file link has been presented to the user, "
"and the user has indicated what they want to do next."
),
system_prompt="""\
Write a research report as an HTML file and present it to the user.
IMPORTANT: save_data requires TWO separate arguments: filename and data.
Call it like: save_data(filename="report.html", data="<html>...</html>")
Do NOT use _raw, do NOT nest arguments inside a JSON string.
**STEP 1 — Write and save the HTML report (tool calls, NO text to user yet):**
Build a clean HTML document. Keep the HTML concise; aim for clarity over length.
Use minimal embedded CSS (a few lines of style, not a full framework).
Report structure:
- Title & date
- Executive Summary (2-3 paragraphs)
- Key Findings (organized by theme, with [n] citation links)
- Analysis (synthesis, implications)
- Conclusion (key takeaways)
- References (numbered list with clickable URLs)
Requirements:
- Every factual claim must cite its source with [n] notation
- Be objective; present multiple viewpoints where sources disagree
- Answer the original research questions from the brief
- If findings appear incomplete or summarized, call list_data_files() and load_data() \
to access the detailed source material from the research phase. The research node's \
tool results and research_notes.md contain the full data.
Save the HTML:
save_data(filename="report.html", data="<html>...</html>")
Then get the clickable link:
serve_file_to_user(filename="report.html", label="Research Report")
If save_data fails, simplify and shorten the HTML, then retry.
**STEP 2 — Present the link to the user (text only, NO tool calls):**
Tell the user the report is ready and include the file:// URI from
serve_file_to_user so they can click it to open. Give a brief summary
of what the report covers. Ask if they have questions or want to continue.
**STEP 3 — After the user responds:**
- Answer any follow-up questions from the research material
- When the user is ready to move on, ask what they'd like to do next:
- Research a new topic?
- Dig deeper into the current topic?
- Then call set_output:
- set_output("delivery_status", "completed")
- set_output("next_action", "new_topic") if they want a new topic
- set_output("next_action", "more_research") if they want deeper research
""",
tools=[
"save_data",
"append_data",
"edit_data",
"serve_file_to_user",
"load_data",
"list_data_files",
],
)
__all__ = [
"intake_node",
"research_node",
"review_node",
"report_node",
]
@@ -1,640 +0,0 @@
---
name: hive-credentials
description: Set up and install credentials for an agent. Detects missing credentials from agent config, collects them from the user, and stores them securely in the local encrypted store at ~/.hive/credentials.
license: Apache-2.0
metadata:
author: hive
version: "2.3"
type: utility
---
# Setup Credentials
Interactive credential setup for agents with multiple authentication options. Detects what's missing, offers auth method choices, validates with health checks, and stores credentials securely.
## When to Use
- Before running or testing an agent for the first time
- When `AgentRunner.run()` fails with "missing required credentials"
- When a user asks to configure credentials for an agent
- After building a new agent that uses tools requiring API keys
## Workflow
### Step 1: Identify the Agent
Determine which agent needs credentials. The user will either:
- Name the agent directly (e.g., "set up credentials for hubspot-agent")
- Have an agent directory open (check `exports/` for agent dirs)
- Be working on an agent in the current session
Locate the agent's directory under `exports/{agent_name}/`.
### Step 2: Detect Missing Credentials
Use the `check_missing_credentials` MCP tool to detect what the agent needs and what's already configured. This tool loads the agent, inspects its required tools and node types, maps them to credentials via `CREDENTIAL_SPECS`, and checks both the encrypted store and environment variables.
```
check_missing_credentials(agent_path="exports/{agent_name}")
```
The tool returns a JSON response:
```json
{
"agent": "exports/{agent_name}",
"missing": [
{
"credential_name": "brave_search",
"env_var": "BRAVE_SEARCH_API_KEY",
"description": "Brave Search API key for web search",
"help_url": "https://brave.com/search/api/",
"tools": ["web_search"]
}
],
"available": [
{
"credential_name": "anthropic",
"env_var": "ANTHROPIC_API_KEY",
"source": "encrypted_store"
}
],
"total_missing": 1,
"ready": false
}
```
**If `ready` is true (nothing missing):** Report all credentials as configured and skip Steps 3-5. Example:
```
All required credentials are already configured:
✓ anthropic (ANTHROPIC_API_KEY)
✓ brave_search (BRAVE_SEARCH_API_KEY)
Your agent is ready to run!
```
**If credentials are missing:** Continue to Step 3 with the `missing` list.
### Step 3: Present Auth Options for Each Missing Credential
For each missing credential, check what authentication methods are available:
```python
from aden_tools.credentials import CREDENTIAL_SPECS
spec = CREDENTIAL_SPECS.get("hubspot")
if spec:
# Determine available auth options
auth_options = []
if spec.aden_supported:
auth_options.append("aden")
if spec.direct_api_key_supported:
auth_options.append("direct")
auth_options.append("custom") # Always available
# Get setup info
setup_info = {
"env_var": spec.env_var,
"description": spec.description,
"help_url": spec.help_url,
"api_key_instructions": spec.api_key_instructions,
}
```
Present the available options using AskUserQuestion:
```
Choose how to configure HUBSPOT_ACCESS_TOKEN:
1) Aden Platform (OAuth) (Recommended)
Secure OAuth2 flow via hive.adenhq.com
- Quick setup with automatic token refresh
- No need to manage API keys manually
2) Direct API Key
Enter your own API key manually
- Requires creating a HubSpot Private App
- Full control over scopes and permissions
3) Local Credential Setup (Advanced)
Programmatic configuration for CI/CD
- For automated deployments
- Requires manual API calls
```
### Step 4: Execute Auth Flow Based on User Choice
#### Prerequisite: Ensure HIVE_CREDENTIAL_KEY Is Available
Before storing any credentials, verify `HIVE_CREDENTIAL_KEY` is set (needed to encrypt/decrypt the local store). Check both the current session and shell config:
```bash
# Check current session
printenv HIVE_CREDENTIAL_KEY > /dev/null 2>&1 && echo "session: set" || echo "session: not set"
# Check shell config files
for f in ~/.zshrc ~/.bashrc ~/.profile; do [ -f "$f" ] && grep -q 'HIVE_CREDENTIAL_KEY' "$f" && echo "$f"; done
```
- **In current session** — proceed to store credentials
- **In shell config but NOT in current session** — run `source ~/.zshrc` (or `~/.bashrc`) first, then proceed
- **Not set anywhere** — `EncryptedFileStorage` will auto-generate one. After storing, tell the user to persist it: `export HIVE_CREDENTIAL_KEY="{generated_key}"` in their shell profile
> **⚠️ IMPORTANT: After adding `HIVE_CREDENTIAL_KEY` to the user's shell config, always display:**
> ```
> ⚠️ Environment variables were added to your shell config.
> Open a NEW TERMINAL for them to take effect outside this session.
> ```
#### Option 1: Aden Platform (OAuth)
This is the recommended flow for supported integrations (HubSpot, etc.).
**How Aden OAuth Works:**
The ADEN_API_KEY represents a user who has already completed OAuth authorization on Aden's platform. When users sign up and connect integrations on Aden, those OAuth tokens are stored server-side. Having an ADEN_API_KEY means:
1. User has an Aden account
2. User has already authorized integrations (HubSpot, etc.) via OAuth on Aden
3. We just need to sync those credentials down to the local credential store
**4.1a. Check for ADEN_API_KEY**
```python
import os
aden_key = os.environ.get("ADEN_API_KEY")
```
If not set, guide user to get one from Aden (this is where they do OAuth):
```python
from aden_tools.credentials import open_browser, get_aden_setup_url
# Open browser to Aden - user will sign up and connect integrations there
url = get_aden_setup_url() # https://hive.adenhq.com
success, msg = open_browser(url)
print("Please sign in to Aden and connect your integrations (HubSpot, etc.).")
print("Once done, copy your API key and return here.")
```
Ask user to provide the ADEN_API_KEY they received.
**4.1b. Save ADEN_API_KEY to Shell Config**
With user approval, persist ADEN_API_KEY to their shell config:
```python
from aden_tools.credentials import (
detect_shell,
add_env_var_to_shell_config,
get_shell_source_command,
)
shell_type = detect_shell() # 'bash', 'zsh', or 'unknown'
# Ask user for approval before modifying shell config
# If approved:
success, config_path = add_env_var_to_shell_config(
"ADEN_API_KEY",
user_provided_key,
comment="Aden Platform (OAuth) API key"
)
if success:
source_cmd = get_shell_source_command()
print(f"Saved to {config_path}")
print(f"Run: {source_cmd}")
```
> **⚠️ IMPORTANT: After adding `ADEN_API_KEY` to the user's shell config, always display:**
> ```
> ⚠️ Environment variables were added to your shell config.
> Open a NEW TERMINAL for them to take effect outside this session.
> ```
Also save to `~/.hive/configuration.json` for the framework:
```python
import json
from pathlib import Path
config_path = Path.home() / ".hive" / "configuration.json"
config = json.loads(config_path.read_text()) if config_path.exists() else {}
config["aden"] = {
"api_key_configured": True,
"api_url": "https://api.adenhq.com"
}
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(json.dumps(config, indent=2))
```
**4.1c. Sync Credentials from Aden Server**
Since the user has already authorized integrations on Aden, use the one-liner factory method:
```python
from core.framework.credentials import CredentialStore
# This single call handles everything:
# - Creates encrypted local storage at ~/.hive/credentials
# - Configures Aden client from ADEN_API_KEY env var
# - Syncs all credentials from Aden server automatically
store = CredentialStore.with_aden_sync(
base_url="https://api.adenhq.com",
auto_sync=True, # Syncs on creation
)
# Check what was synced
synced = store.list_credentials()
print(f"Synced credentials: {synced}")
# If the required credential wasn't synced, the user hasn't authorized it on Aden yet
if "hubspot" not in synced:
print("HubSpot not found in your Aden account.")
print("Please visit https://hive.adenhq.com to connect HubSpot, then try again.")
```
For more control over the sync process:
```python
from core.framework.credentials import CredentialStore
from core.framework.credentials.aden import (
AdenCredentialClient,
AdenClientConfig,
AdenSyncProvider,
)
# Create client (API key loaded from ADEN_API_KEY env var)
client = AdenCredentialClient(AdenClientConfig(
base_url="https://api.adenhq.com",
))
# Create provider and store
provider = AdenSyncProvider(client=client)
store = CredentialStore.with_encrypted_storage()
# Manual sync
synced_count = provider.sync_all(store)
print(f"Synced {synced_count} credentials from Aden")
```
**4.1d. Run Health Check**
```python
from aden_tools.credentials import check_credential_health
# Get the token from the store
cred = store.get_credential("hubspot")
token = cred.keys["access_token"].value.get_secret_value()
result = check_credential_health("hubspot", token)
if result.valid:
print("HubSpot credentials validated successfully!")
else:
print(f"Validation failed: {result.message}")
# Offer to retry the OAuth flow
```
#### Option 2: Direct API Key
For users who prefer manual API key management.
**4.2a. Show Setup Instructions**
```python
from aden_tools.credentials import CREDENTIAL_SPECS
spec = CREDENTIAL_SPECS.get("hubspot")
if spec and spec.api_key_instructions:
print(spec.api_key_instructions)
# Output:
# To get a HubSpot Private App token:
# 1. Go to HubSpot Settings > Integrations > Private Apps
# 2. Click "Create a private app"
# 3. Name your app (e.g., "Hive Agent")
# ...
if spec and spec.help_url:
print(f"More info: {spec.help_url}")
```
**4.2b. Collect API Key from User**
Use AskUserQuestion to securely collect the API key:
```
Please provide your HubSpot access token:
(This will be stored securely in ~/.hive/credentials)
```
**4.2c. Run Health Check Before Storing**
```python
from aden_tools.credentials import check_credential_health
result = check_credential_health("hubspot", user_provided_token)
if not result.valid:
print(f"Warning: {result.message}")
# Ask user if they want to:
# 1. Try a different token
# 2. Continue anyway (not recommended)
```
**4.2d. Store in Local Encrypted Store**
```python
from core.framework.credentials import CredentialStore, CredentialObject, CredentialKey
from pydantic import SecretStr
store = CredentialStore.with_encrypted_storage()
cred = CredentialObject(
id="hubspot",
name="HubSpot Access Token",
keys={
"access_token": CredentialKey(
name="access_token",
value=SecretStr(user_provided_token),
)
},
)
store.save_credential(cred)
```
**4.2e. Export to Current Session**
```bash
export HUBSPOT_ACCESS_TOKEN="the-value"
```
#### Option 3: Local Credential Setup (Advanced)
For programmatic/CI/CD setups.
**4.3a. Show Documentation**
```
For advanced credential management, you can use the CredentialStore API directly:
from core.framework.credentials import CredentialStore, CredentialObject, CredentialKey
from pydantic import SecretStr
store = CredentialStore.with_encrypted_storage()
cred = CredentialObject(
id="hubspot",
name="HubSpot Access Token",
keys={"access_token": CredentialKey(name="access_token", value=SecretStr("..."))}
)
store.save_credential(cred)
For CI/CD environments:
- Set HIVE_CREDENTIAL_KEY for encryption
- Pre-populate ~/.hive/credentials programmatically
- Or use environment variables directly (HUBSPOT_ACCESS_TOKEN)
Documentation: See core/framework/credentials/README.md
```
### Step 5: Record Configuration Method
Track which auth method was used for each credential in `~/.hive/configuration.json`:
```python
import json
from pathlib import Path
from datetime import datetime
config_path = Path.home() / ".hive" / "configuration.json"
config = json.loads(config_path.read_text()) if config_path.exists() else {}
if "credential_methods" not in config:
config["credential_methods"] = {}
config["credential_methods"]["hubspot"] = {
"method": "aden", # or "direct" or "custom"
"configured_at": datetime.now().isoformat(),
}
config_path.write_text(json.dumps(config, indent=2))
```
### Step 6: Verify All Credentials
Use the `verify_credentials` MCP tool to confirm everything is properly configured:
```
verify_credentials(agent_path="exports/{agent_name}")
```
The tool returns:
```json
{
"agent": "exports/{agent_name}",
"ready": true,
"missing_credentials": [],
"warnings": [],
"errors": []
}
```
If `ready` is true, report success. If `missing_credentials` is non-empty, identify what failed and loop back to Step 3 for the remaining credentials.
## Health Check Reference
Health checks validate credentials by making lightweight API calls:
| Credential | Endpoint | What It Checks |
| --------------- | --------------------------------------- | --------------------------------- |
| `anthropic` | `POST /v1/messages` | API key validity |
| `brave_search` | `GET /res/v1/web/search?q=test&count=1` | API key validity |
| `google_search` | `GET /customsearch/v1?q=test&num=1` | API key + CSE ID validity |
| `github` | `GET /user` | Token validity, user identity |
| `hubspot` | `GET /crm/v3/objects/contacts?limit=1` | Bearer token validity, CRM scopes |
| `resend` | `GET /domains` | API key validity |
```python
from aden_tools.credentials import check_credential_health, HealthCheckResult
result: HealthCheckResult = check_credential_health("hubspot", token_value)
# result.valid: bool
# result.message: str
# result.details: dict (status_code, rate_limited, etc.)
```
## Encryption Key (HIVE_CREDENTIAL_KEY)
The local encrypted store requires `HIVE_CREDENTIAL_KEY` to encrypt/decrypt credentials.
- If the user doesn't have one, `EncryptedFileStorage` will auto-generate one and log it
- The user MUST persist this key (e.g., in `~/.bashrc`/`~/.zshrc` or a secrets manager)
- Without this key, stored credentials cannot be decrypted
**Shell config rule:** Only TWO keys belong in shell config (`~/.zshrc`/`~/.bashrc`):
- `HIVE_CREDENTIAL_KEY` — encryption key for the credential store
- `ADEN_API_KEY` — Aden platform auth key (needed before the store can sync)
All other API keys (Brave, Google, HubSpot, etc.) must go in the encrypted store only. **Never offer to add them to shell config.**
If `HIVE_CREDENTIAL_KEY` is not set:
1. Let the store generate one
2. Tell the user to save it: `export HIVE_CREDENTIAL_KEY="{generated_key}"`
3. Recommend adding it to `~/.bashrc` or their shell profile
## Security Rules
- **NEVER** log, print, or echo credential values in tool output
- **NEVER** store credentials in plaintext files, git-tracked files, or agent configs
- **NEVER** hardcode credentials in source code
- **NEVER** offer to save API keys to shell config (`~/.zshrc`/`~/.bashrc`) — the **only** keys that belong in shell config are `HIVE_CREDENTIAL_KEY` and `ADEN_API_KEY`. All other credentials (Brave, Google, HubSpot, GitHub, Resend, etc.) go in the encrypted store only.
- **ALWAYS** use `SecretStr` from Pydantic when handling credential values in Python
- **ALWAYS** use the local encrypted store (`~/.hive/credentials`) for persistence
- **ALWAYS** run health checks before storing credentials (when possible)
- **ALWAYS** verify credentials were stored by re-running validation, not by reading them back
- When modifying `~/.bashrc` or `~/.zshrc`, confirm with the user first
## Credential Sources Reference
All credential specs are defined in `tools/src/aden_tools/credentials/`:
| File | Category | Credentials | Aden Supported |
| ----------------- | ------------- | --------------------------------------------- | -------------- |
| `llm.py` | LLM Providers | `anthropic` | No |
| `search.py` | Search Tools | `brave_search`, `google_search`, `google_cse` | No |
| `email.py` | Email | `resend` | No |
| `integrations.py` | Integrations | `github`, `hubspot`, `google_calendar_oauth` | No / Yes |
**Note:** Additional LLM providers (Cerebras, Groq, OpenAI) are handled by LiteLLM via environment
variables (`CEREBRAS_API_KEY`, `GROQ_API_KEY`, `OPENAI_API_KEY`) but are not yet in CREDENTIAL_SPECS.
Add them to `llm.py` as needed.
To check what's registered:
```python
from aden_tools.credentials import CREDENTIAL_SPECS
for name, spec in CREDENTIAL_SPECS.items():
print(f"{name}: aden={spec.aden_supported}, direct={spec.direct_api_key_supported}")
```
## Migration: CredentialManager → CredentialStore
**CredentialManager is deprecated.** Use CredentialStore instead.
| Old (Deprecated) | New (Recommended) |
| ----------------------------------------- | -------------------------------------------------------------------- |
| `CredentialManager()` | `CredentialStore.with_encrypted_storage()` |
| `creds.get("hubspot")` | `store.get("hubspot")` or `store.get_key("hubspot", "access_token")` |
| `creds.validate_for_tools(tools)` | Use `store.is_available(cred_id)` per credential |
| `creds.get_auth_options("hubspot")` | Check `CREDENTIAL_SPECS["hubspot"].aden_supported` |
| `creds.get_setup_instructions("hubspot")` | Access `CREDENTIAL_SPECS["hubspot"]` directly |
**Why migrate?**
- **CredentialStore** supports encrypted storage, multi-key credentials, template resolution, and automatic token refresh
- **CredentialManager** only reads from environment variables and .env files (no encryption, no refresh)
- **CredentialStoreAdapter** exists for backward compatibility during migration
```python
# Old way (deprecated)
from aden_tools.credentials import CredentialManager
creds = CredentialManager()
token = creds.get("hubspot")
# New way (recommended)
from core.framework.credentials import CredentialStore
store = CredentialStore.with_encrypted_storage()
token = store.get("hubspot")
# With Aden sync (recommended for OAuth integrations)
store = CredentialStore.with_aden_sync()
token = store.get_key("hubspot", "access_token")
```
## Example Session
```
User: /hive-credentials for my research-agent
Agent: Let me check what credentials your research-agent needs.
[Calls check_missing_credentials(agent_path="exports/research-agent")]
→ Returns:
available: anthropic (encrypted_store), brave_search (encrypted_store)
missing: google_search (GOOGLE_API_KEY), google_cse (GOOGLE_CSE_ID)
ready: false
Agent: 2 of 4 required credentials are already configured. Only Google Custom
Search needs setup (2 values).
--- Setting up Google Custom Search (google_search + google_cse) ---
This requires two values that work together.
[Checks HIVE_CREDENTIAL_KEY before storing]
$ printenv HIVE_CREDENTIAL_KEY > /dev/null 2>&1 && echo "set" || echo "not set"
set
First, the Google API Key:
1. Go to https://console.cloud.google.com/apis/credentials
2. Create a new project (or select an existing one)
3. Enable the "Custom Search API" from the API Library
4. Go to Credentials > Create Credentials > API Key
5. Copy the generated API key
[AskUserQuestion: "Please provide your Google API key:"]
[User provides key]
Now, the Custom Search Engine ID:
1. Go to https://programmablesearchengine.google.com/controlpanel/all
2. Click "Add" to create a new search engine
3. Under "What to search", select "Search the entire web"
4. Give your search engine a name
5. Click "Create"
6. Copy the Search Engine ID (cx value)
[AskUserQuestion: "Please provide your Google CSE ID:"]
[User provides ID]
[Runs health check with both values - GET /customsearch/v1?q=test&num=1 → 200 OK]
[Stores both in local encrypted store, exports to env]
✓ Google Custom Search credentials valid
[Calls verify_credentials(agent_path="exports/research-agent")]
→ Returns: ready: true, missing_credentials: []
All credentials are now configured:
✓ anthropic (ANTHROPIC_API_KEY) — already in encrypted store
✓ brave_search (BRAVE_SEARCH_API_KEY) — already in encrypted store
✓ google_search (GOOGLE_API_KEY) — stored in encrypted store
✓ google_cse (GOOGLE_CSE_ID) — stored in encrypted store
┌─────────────────────────────────────────────────────────────────────────────┐
│ ✅ CREDENTIALS CONFIGURED │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ OPEN A NEW TERMINAL before running commands below. │
│ Environment variables were saved to your shell config but │
│ only take effect in new terminal sessions. │
│ │
│ NEXT STEPS: │
│ │
│ 1. RUN YOUR AGENT: │
│ │
│ hive tui │
│ │
│ 2. IF YOU ENCOUNTER ISSUES, USE THE DEBUGGER: │
│ │
│ /hive-debugger │
│ │
│ The debugger analyzes runtime logs, identifies retry loops, tool │
│ failures, stalled execution, and provides actionable fix suggestions. │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
File diff suppressed because it is too large
@@ -1,385 +0,0 @@
---
name: hive-patterns
description: Best practices, patterns, and examples for building goal-driven agents. Includes client-facing interaction, feedback edges, judge patterns, fan-out/fan-in, context management, and anti-patterns.
license: Apache-2.0
metadata:
author: hive
version: "2.0"
type: reference
part_of: hive
---
# Building Agents - Patterns & Best Practices
Design patterns, examples, and best practices for building robust goal-driven agents.
**Prerequisites:** Complete agent structure using `hive-create`.
## Practical Example: Hybrid Workflow
How to build a node using both direct file writes and optional MCP validation:
```python
# 1. WRITE TO FILE FIRST (Primary - makes it visible)
node_code = '''
search_node = NodeSpec(
id="search-web",
node_type="event_loop",
input_keys=["query"],
output_keys=["search_results"],
system_prompt="Search the web for: {query}. Use web_search, then call set_output to store results.",
tools=["web_search"],
)
'''
Edit(
file_path="exports/research_agent/nodes/__init__.py",
old_string="# Nodes will be added here",
new_string=node_code
)
# 2. OPTIONALLY VALIDATE WITH MCP (Secondary - bookkeeping)
validation = mcp__agent-builder__test_node(
node_id="search-web",
test_input='{"query": "python tutorials"}',
mock_llm_response='{"search_results": [...mock results...]}'
)
```
**User experience:**
- Immediately sees node in their editor (from step 1)
- Gets validation feedback (from step 2)
- Can edit the file directly if needed
## Multi-Turn Interaction Patterns
For agents needing multi-turn conversations with users, use `client_facing=True` on event_loop nodes.
### Client-Facing Nodes
A client-facing node streams LLM output to the user and blocks for user input between conversational turns. This replaces the old pause/resume pattern.
```python
# Client-facing node with STEP 1/STEP 2 prompt pattern
intake_node = NodeSpec(
id="intake",
name="Intake",
description="Gather requirements from the user",
node_type="event_loop",
client_facing=True,
input_keys=["topic"],
output_keys=["research_brief"],
system_prompt="""\
You are an intake specialist.
**STEP 1 — Read and respond (text only, NO tool calls):**
1. Read the topic provided
2. If it's vague, ask 1-2 clarifying questions
3. If it's clear, confirm your understanding
**STEP 2 — After the user confirms, call set_output:**
- set_output("research_brief", "Clear description of what to research")
""",
)
# Internal node runs without user interaction
research_node = NodeSpec(
id="research",
name="Research",
description="Search and analyze sources",
node_type="event_loop",
input_keys=["research_brief"],
output_keys=["findings", "sources"],
system_prompt="Research the topic using web_search and web_scrape...",
tools=["web_search", "web_scrape", "load_data", "save_data"],
)
```
**How it works:**
- Client-facing nodes stream LLM text to the user and block for input after each response
- User input is injected via `node.inject_event(text)`
- When the LLM calls `set_output` to produce structured outputs, the judge evaluates and ACCEPTs
- Internal nodes (non-client-facing) run their entire loop without blocking
- `set_output` is a synthetic tool — a turn with only `set_output` calls (no real tools) triggers user input blocking
**STEP 1/STEP 2 pattern:** Always structure client-facing prompts with explicit phases. STEP 1 is text-only conversation. STEP 2 calls `set_output` after user confirmation. This prevents the LLM from calling `set_output` prematurely before the user responds.
### When to Use client_facing
| Scenario | client_facing | Why |
| ----------------------------------- | :-----------: | ---------------------- |
| Gathering user requirements | Yes | Need user input |
| Human review/approval checkpoint | Yes | Need human decision |
| Data processing (scanning, scoring) | No | Runs autonomously |
| Report generation | No | No user input needed |
| Final confirmation before action | Yes | Need explicit approval |
> **Legacy Note:** The `pause_nodes` / `entry_points` pattern still works for backward compatibility but `client_facing=True` is preferred for new agents.
## Edge-Based Routing and Feedback Loops
### Conditional Edge Routing
Multiple conditional edges from the same source replace the old `router` node type. Each edge checks a condition on the node's output.
```python
# Node with mutually exclusive outputs
review_node = NodeSpec(
id="review",
name="Review",
node_type="event_loop",
client_facing=True,
output_keys=["approved_contacts", "redo_extraction"],
nullable_output_keys=["approved_contacts", "redo_extraction"],
max_node_visits=3,
system_prompt="Present the contact list to the operator. If they approve, call set_output('approved_contacts', ...). If they want changes, call set_output('redo_extraction', 'true').",
)
# Forward edge (positive priority, evaluated first)
EdgeSpec(
id="review-to-campaign",
source="review",
target="campaign-builder",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('approved_contacts') is not None",
priority=1,
)
# Feedback edge (negative priority, evaluated after forward edges)
EdgeSpec(
id="review-feedback",
source="review",
target="extractor",
condition=EdgeCondition.CONDITIONAL,
condition_expr="output.get('redo_extraction') is not None",
priority=-1,
)
```
**Key concepts:**
- `nullable_output_keys`: Lists output keys that may remain unset. The node sets exactly one of the mutually exclusive keys per execution.
- `max_node_visits`: Must be >1 on the feedback target (extractor) so it can re-execute. Default is 1.
- `priority`: Positive = forward edge (evaluated first). Negative = feedback edge. The executor tries forward edges first; if none match, falls back to feedback edges.
### Routing Decision Table
| Pattern | Old Approach | New Approach |
| ---------------------- | ----------------------- | --------------------------------------------- |
| Conditional branching | `router` node | Conditional edges with `condition_expr` |
| Binary approve/reject | `pause_nodes` + resume | `client_facing=True` + `nullable_output_keys` |
| Loop-back on rejection | Manual entry_points | Feedback edge with `priority=-1` |
| Multi-way routing | Router with routes dict | Multiple conditional edges with priorities |
## Judge Patterns
**Core Principle: The judge is the SOLE mechanism for acceptance decisions.** Never add ad-hoc framework gating to compensate for LLM behavior. If the LLM calls `set_output` prematurely, fix the system prompt or use a custom judge. Anti-patterns to avoid:
- Output rollback logic
- `_user_has_responded` flags
- Premature set_output rejection
- Interaction protocol injection into system prompts
Judges control when an event_loop node's loop exits. Choose based on validation needs.
### Implicit Judge (Default)
When no judge is configured, the implicit judge ACCEPTs when:
- The LLM finishes its response with no tool calls
- All required output keys have been set via `set_output`
Best for simple nodes where "all outputs set" is sufficient validation.
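A minimal sketch of a node that relies on the implicit judge: no judge is configured, so the loop exits once both output keys have been set via `set_output` and the LLM finishes without tool calls (the import path matches the NodeSpec usage elsewhere in this repo):
```python
from framework.graph import NodeSpec

# No judge configured: the implicit judge ACCEPTs once "fields" and "notes"
# are both set and the LLM replies without further tool calls.
extract_node = NodeSpec(
    id="extract-fields",
    name="Extract Fields",
    node_type="event_loop",
    input_keys=["raw_text"],
    output_keys=["fields", "notes"],
    system_prompt="Extract the requested fields from the text, then call set_output for 'fields' and 'notes'.",
)
```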
### SchemaJudge
Validates outputs against a Pydantic model. Use when you need structural validation.
```python
from pydantic import BaseModel, ValidationError
# JudgeVerdict is provided by the framework; import it from your framework version
class ScannerOutput(BaseModel):
github_users: list[dict] # Must be a list of user objects
class SchemaJudge:
def __init__(self, output_model: type[BaseModel]):
self._model = output_model
async def evaluate(self, context: dict) -> JudgeVerdict:
missing = context.get("missing_keys", [])
if missing:
return JudgeVerdict(
action="RETRY",
feedback=f"Missing output keys: {missing}. Use set_output to provide them.",
)
try:
self._model.model_validate(context["output_accumulator"])
return JudgeVerdict(action="ACCEPT")
except ValidationError as e:
return JudgeVerdict(action="RETRY", feedback=str(e))
```
### When to Use Which Judge
| Judge | Use When | Example |
| --------------- | ------------------------------------- | ---------------------- |
| Implicit (None) | Output keys are sufficient validation | Simple data extraction |
| SchemaJudge | Need structural validation of outputs | API response parsing |
| Custom | Domain-specific validation logic | Score must be 0.0-1.0 |
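A sketch of the custom-judge row above (score must be 0.0-1.0), following the same `evaluate(context) -> JudgeVerdict` shape as the SchemaJudge example; `JudgeVerdict` again comes from the framework, and the import is assumed:
```python
class ScoreRangeJudge:
    """Custom judge: accept only when 'score' is a number in [0.0, 1.0]."""

    async def evaluate(self, context: dict) -> JudgeVerdict:
        missing = context.get("missing_keys", [])
        if missing:
            return JudgeVerdict(
                action="RETRY",
                feedback=f"Missing output keys: {missing}. Use set_output to provide them.",
            )
        raw = context["output_accumulator"].get("score")
        try:
            score = float(raw)
        except (TypeError, ValueError):
            return JudgeVerdict(action="RETRY", feedback="'score' must be a number between 0.0 and 1.0.")
        if not 0.0 <= score <= 1.0:
            return JudgeVerdict(action="RETRY", feedback=f"'score' is {score}; it must be between 0.0 and 1.0.")
        return JudgeVerdict(action="ACCEPT")
```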
## Fan-Out / Fan-In (Parallel Execution)
Multiple ON_SUCCESS edges from the same source trigger parallel execution. All branches run concurrently via `asyncio.gather()`.
```python
# Scanner fans out to Profiler and Scorer in parallel
EdgeSpec(id="scanner-to-profiler", source="scanner", target="profiler",
condition=EdgeCondition.ON_SUCCESS)
EdgeSpec(id="scanner-to-scorer", source="scanner", target="scorer",
condition=EdgeCondition.ON_SUCCESS)
# Both fan in to Extractor
EdgeSpec(id="profiler-to-extractor", source="profiler", target="extractor",
condition=EdgeCondition.ON_SUCCESS)
EdgeSpec(id="scorer-to-extractor", source="scorer", target="extractor",
condition=EdgeCondition.ON_SUCCESS)
```
**Requirements:**
- Parallel event_loop nodes must have **disjoint output_keys** (no key written by both)
- Only one parallel branch may contain a `client_facing` node
- Fan-in node receives outputs from all completed branches in shared memory
## Context Management Patterns
### Tiered Compaction
EventLoopNode automatically manages context window usage with tiered compaction:
1. **Pruning** — Old tool results replaced with compact placeholders (zero-cost, no LLM call)
2. **Normal compaction** — LLM summarizes older messages
3. **Aggressive compaction** — Keeps only recent messages + summary
4. **Emergency** — Hard reset with tool history preservation
### Spillover Pattern
The framework automatically truncates large tool results and saves full content to a spillover directory. The LLM receives a truncation message with instructions to use `load_data` to read the full result.
For explicit data management, use the data tools (real MCP tools, not synthetic):
```python
# save_data, load_data, list_data_files, serve_file_to_user are real MCP tools
# data_dir is auto-injected by the framework — the LLM never sees it
# Saving large results
save_data(filename="sources.json", data=large_json_string)
# Reading with pagination (line-based offset/limit)
load_data(filename="sources.json", offset=0, limit=50)
# Listing available files
list_data_files()
# Serving a file to the user as a clickable link
serve_file_to_user(filename="report.html", label="Research Report")
```
Add data tools to nodes that handle large tool results:
```python
research_node = NodeSpec(
...
tools=["web_search", "web_scrape", "load_data", "save_data", "list_data_files"],
)
```
`data_dir` is a framework context parameter — auto-injected at call time. `GraphExecutor.execute()` sets it per-execution via `ToolRegistry.set_execution_context(data_dir=...)` (using `contextvars` for concurrency safety), ensuring it matches the session-scoped spillover directory.
## Anti-Patterns
### What NOT to Do
- **Don't rely on `export_graph`** — Write files immediately, not at end
- **Don't hide code in session** — Write to files as components are approved
- **Don't wait to write files** — Agent visible from first step
- **Don't batch everything** — Write incrementally, one component at a time
- **Don't create too many thin nodes** — Prefer fewer, richer nodes (see below)
- **Don't add framework gating for LLM behavior** — Fix prompts or use judges instead
### Fewer, Richer Nodes
A common mistake is splitting work into too many small single-purpose nodes. Each node boundary requires serializing outputs, losing in-context information, and adding edge complexity.
| Bad (8 thin nodes) | Good (4 rich nodes) |
| ------------------- | ----------------------------------- |
| parse-query | intake (client-facing) |
| search-sources | research (search + fetch + analyze) |
| fetch-content | review (client-facing) |
| evaluate-sources | report (write + deliver) |
| synthesize-findings | |
| write-report | |
| quality-check | |
| save-report | |
**Why fewer nodes are better:**
- The LLM retains full context of its work within a single node
- A research node that searches, fetches, and analyzes keeps all source material in its conversation history
- Fewer edges means simpler graph and fewer failure points
- Data tools (`save_data`/`load_data`) handle context window limits within a single node
### MCP Tools - Correct Usage
**MCP tools OK for:**
- `test_node` — Validate node configuration with mock inputs
- `validate_graph` — Check graph structure
- `configure_loop` — Set event loop parameters
- `create_session` — Track session state for bookkeeping
**Just don't:** Use MCP as the primary construction method or rely on export_graph
## Error Handling Patterns
### Graceful Failure with Fallback
```python
edges = [
# Success path
EdgeSpec(id="api-success", source="api-call", target="process-results",
condition=EdgeCondition.ON_SUCCESS),
# Fallback on failure
EdgeSpec(id="api-to-fallback", source="api-call", target="fallback-cache",
condition=EdgeCondition.ON_FAILURE, priority=1),
# Report if fallback also fails
EdgeSpec(id="fallback-to-error", source="fallback-cache", target="report-error",
condition=EdgeCondition.ON_FAILURE, priority=1),
]
```
## Handoff to Testing
When agent is complete, transition to testing phase:
### Pre-Testing Checklist
- [ ] Agent structure validates: `uv run python -m agent_name validate`
- [ ] All nodes defined in `nodes/__init__.py`
- [ ] All edges connect valid nodes with correct priorities
- [ ] Feedback edge targets have `max_node_visits > 1`
- [ ] Client-facing nodes have meaningful system prompts
- [ ] Agent can be imported: `from exports.agent_name import default_agent`
## Related Skills
- **hive-concepts** — Fundamental concepts (node types, edges, event loop architecture)
- **hive-create** — Step-by-step building process
- **hive-test** — Test and validate agents
- **hive** — Complete workflow orchestrator
---
**Remember: Agent is actively constructed, visible the whole time. No hidden state. No surprise exports. Just transparent, incremental file building.**
@@ -1,940 +0,0 @@
---
name: hive-test
description: Iterative agent testing with session recovery. Execute, analyze, fix, resume from checkpoints. Use when testing an agent, debugging test failures, or verifying fixes without re-running from scratch.
---
# Agent Testing
Test agents iteratively: execute, analyze failures, fix, resume from checkpoint, repeat.
## When to Use
- Testing a newly built agent against its goal
- Debugging a failing agent iteratively
- Verifying fixes without re-running expensive early nodes
- Running final regression tests before deployment
## Prerequisites
1. Agent package at `exports/{agent_name}/` (built with `/hive-create`)
2. Credentials configured (`/hive-credentials`)
3. `ANTHROPIC_API_KEY` set (or appropriate LLM provider key)
**Path distinction** (critical — don't confuse these):
- `exports/{agent_name}/` — agent source code (edit here)
- `~/.hive/agents/{agent_name}/` — runtime data: sessions, checkpoints, logs (read here)
---
## The Iterative Test Loop
This is the core workflow. Don't re-run the entire agent when a late node fails — analyze, fix, and resume from the last clean checkpoint.
```
┌──────────────────────────────────────┐
│ PHASE 1: Generate Test Scenarios │
│ Goal → synthetic test inputs + tests │
└──────────────┬───────────────────────┘
               ↓
┌──────────────────────────────────────┐
│ PHASE 2: Execute │◄────────────────┐
│ Run agent (CLI or pytest) │ │
└──────────────┬───────────────────────┘ │
↓ │
Pass? ──yes──► PHASE 6: Final Verification │
│ │
no │
↓ │
┌──────────────────────────────────────┐ │
│ PHASE 3: Analyze │ │
│ Session + runtime logs + checkpoints │ │
└──────────────┬───────────────────────┘ │
↓ │
┌──────────────────────────────────────┐ │
│ PHASE 4: Fix │ │
│ Prompt / code / graph / goal │ │
└──────────────┬───────────────────────┘ │
↓ │
┌──────────────────────────────────────┐ │
│ PHASE 5: Recover & Resume │─────────────────┘
│ Checkpoint resume OR fresh re-run │
└──────────────────────────────────────┘
```
---
### Phase 1: Generate Test Scenarios
Create synthetic tests from the agent's goal, constraints, and success criteria.
#### Step 1a: Read the goal
```python
# Read goal from agent.py
Read(file_path="exports/{agent_name}/agent.py")
# Extract the Goal definition and convert to JSON string
```
#### Step 1b: Get test guidelines
```python
# Get constraint test guidelines
generate_constraint_tests(
goal_id="your-goal-id",
goal_json='{"id": "...", "constraints": [...]}',
agent_path="exports/{agent_name}"
)
# Get success criteria test guidelines
generate_success_tests(
goal_id="your-goal-id",
goal_json='{"id": "...", "success_criteria": [...]}',
node_names="intake,research,review,report",
tool_names="web_search,web_scrape",
agent_path="exports/{agent_name}"
)
```
These return `file_header`, `test_template`, `constraints_formatted`/`success_criteria_formatted`, and `test_guidelines`. They do NOT generate test code — you write the tests.
#### Step 1c: Write tests
```python
Write(
file_path=result["output_file"],
content=result["file_header"] + "\n\n" + your_test_code
)
```
#### Test writing rules
- Every test MUST be `async` with `@pytest.mark.asyncio`
- Every test MUST accept `runner, auto_responder, mock_mode` fixtures
- Use `await auto_responder.start()` before running, `await auto_responder.stop()` in `finally`
- Use `await runner.run(input_dict)` — this goes through AgentRunner → AgentRuntime → ExecutionStream
- Access output via `result.output.get("key")` — NEVER `result.output["key"]`
- `result.success=True` means no exception, NOT goal achieved — always check output
- Write 8-15 tests total, not 30+
- Each real test costs ~3 seconds + LLM tokens
- NEVER use `default_agent.run()` — it bypasses the runtime (no sessions, no logs, client-facing nodes hang)
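A minimal sketch of a test that follows these rules; the input key, output key, and test name are assumptions modeled on the deep-research examples in this document:
```python
import pytest


@pytest.mark.asyncio
async def test_success_findings_present(runner, auto_responder, mock_mode):
    """Hypothetical success-criteria test: the agent should produce findings."""
    await auto_responder.start()
    try:
        # Goes through AgentRunner -> AgentRuntime -> ExecutionStream
        result = await runner.run({"topic": "solid-state batteries"})
        assert result.success  # no exception raised; NOT proof the goal was achieved
        findings = result.output.get("findings")  # never index with result.output["findings"]
        assert findings, "expected the research phase to produce findings"
    finally:
        await auto_responder.stop()
```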
#### Step 1d: Check existing tests
Before generating, check if tests already exist:
```python
list_tests(
goal_id="your-goal-id",
agent_path="exports/{agent_name}"
)
```
---
### Phase 2: Execute
Two execution paths, use the right one for your situation.
#### Iterative debugging (for complex agents)
Run the agent via CLI. This creates sessions with checkpoints at `~/.hive/agents/{agent_name}/sessions/`:
```bash
uv run hive run exports/{agent_name} --input '{"query": "test topic"}'
```
Sessions and checkpoints are saved automatically.
**Client-facing nodes**: Agents with `client_facing=True` nodes (interactive conversation) work in headless mode when run from a real terminal — the agent streams output to stdout and reads user input from stdin via a `>>> ` prompt. In non-interactive shells (like Claude Code's Bash tool), client-facing nodes will hang because there is no stdin. For testing interactive agents from Claude Code, use `run_tests` with mock mode or have the user run the agent manually in their terminal.
#### Automated regression (for CI or final verification)
Use the `run_tests` MCP tool to run all pytest tests:
```python
run_tests(
goal_id="your-goal-id",
agent_path="exports/{agent_name}"
)
```
Returns structured results:
```json
{
"overall_passed": false,
"summary": {"total": 12, "passed": 10, "failed": 2, "pass_rate": "83.3%"},
"test_results": [{"test_name": "test_success_source_diversity", "status": "failed"}],
"failures": [{"test_name": "test_success_source_diversity", "details": "..."}]
}
```
**Options:**
```python
# Run only constraint tests
run_tests(goal_id, agent_path, test_types='["constraint"]')
# Stop on first failure
run_tests(goal_id, agent_path, fail_fast=True)
# Parallel execution
run_tests(goal_id, agent_path, parallel=4)
```
**Note:** `run_tests` uses `AgentRunner` with `tmp_path` storage, so sessions are isolated per test run. For checkpoint-based recovery with persistent sessions, use CLI execution. Use `run_tests` for quick regression checks and final verification.
---
### Phase 3: Analyze Failures
When a test fails, drill down systematically. Don't guess — use the tools.
#### Step 3a: Get error category
```python
debug_test(
goal_id="your-goal-id",
test_name="test_success_source_diversity",
agent_path="exports/{agent_name}"
)
```
Returns error category (`IMPLEMENTATION_ERROR`, `ASSERTION_FAILURE`, `TIMEOUT`, `IMPORT_ERROR`, `API_ERROR`) plus full traceback and suggestions.
#### Step 3b: Find the failed session
```python
list_agent_sessions(
agent_work_dir="~/.hive/agents/{agent_name}",
status="failed",
limit=5
)
```
Returns session list with IDs, timestamps, current_node (where it failed), execution_quality.
#### Step 3c: Inspect session state
```python
get_agent_session_state(
agent_work_dir="~/.hive/agents/{agent_name}",
session_id="session_20260209_143022_abc12345"
)
```
Returns execution path, which node was current, step count, timestamps — but excludes memory values (to avoid context bloat). Shows `memory_keys` and `memory_size` instead.
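The response has roughly this shape (field names here are an illustration inferred from the description above; the actual schema may differ):
```json
{
  "session_id": "session_20260209_143022_abc12345",
  "status": "failed",
  "current_node": "research",
  "execution_path": ["intake", "research"],
  "steps_executed": 2,
  "created_at": "2026-02-09T14:30:22",
  "memory_keys": ["query", "research_results"],
  "memory_size": 18432
}
```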
#### Step 3d: Examine runtime logs (L2/L3)
```python
# L2: Per-node success/failure, retry counts
query_runtime_log_details(
agent_work_dir="~/.hive/agents/{agent_name}",
run_id="session_20260209_143022_abc12345",
needs_attention_only=True
)
# L3: Exact LLM responses, tool call inputs/outputs
query_runtime_log_raw(
agent_work_dir="~/.hive/agents/{agent_name}",
run_id="session_20260209_143022_abc12345",
node_id="research"
)
```
#### Step 3e: Inspect memory data
```python
# See what data a node actually produced
get_agent_session_memory(
agent_work_dir="~/.hive/agents/{agent_name}",
session_id="session_20260209_143022_abc12345",
key="research_results"
)
```
#### Step 3f: Find recovery points
```python
list_agent_checkpoints(
agent_work_dir="~/.hive/agents/{agent_name}",
session_id="session_20260209_143022_abc12345",
is_clean="true"
)
```
Returns checkpoint summaries with IDs, types (`node_start`, `node_complete`), which node, and `is_clean` flag. Clean checkpoints are safe resume points.
#### Step 3g: Compare checkpoints (optional)
To understand what changed between two points in execution:
```python
compare_agent_checkpoints(
agent_work_dir="~/.hive/agents/{agent_name}",
session_id="session_20260209_143022_abc12345",
checkpoint_id_before="cp_node_complete_research_143030",
checkpoint_id_after="cp_node_complete_review_143115"
)
```
Returns memory diff (added/removed/changed keys) and execution path diff.
---
### Phase 4: Fix Based on Root Cause
Use the analysis from Phase 3 to determine what to fix and where.
| Root Cause | What to Fix | Where to Edit |
|------------|------------|---------------|
| **Prompt issue** — LLM produces wrong output format, misses instructions | Node `system_prompt` | `exports/{agent}/nodes/__init__.py` |
| **Code bug** — TypeError, KeyError, logic error in Python | Agent code | `exports/{agent}/agent.py`, `nodes/__init__.py` |
| **Graph issue** — wrong routing, missing edge, bad condition_expr | Edges, node config | `exports/{agent}/agent.py` |
| **Tool issue** — MCP tool fails, wrong config, missing credential | Tool config | `exports/{agent}/mcp_servers.json`, `/hive-credentials` |
| **Goal issue** — success criteria too strict/vague, wrong constraints | Goal definition | `exports/{agent}/agent.py` (goal section) |
| **Test issue** — test expectations don't match actual agent behavior | Test code | `exports/{agent}/tests/test_*.py` |
#### Fix strategies by error category
**IMPLEMENTATION_ERROR** (TypeError, AttributeError, KeyError):
```python
# Read the failing code
Read(file_path="exports/{agent_name}/nodes/__init__.py")
# Fix the bug
Edit(
file_path="exports/{agent_name}/nodes/__init__.py",
old_string="results.get('videos')",
new_string="(results or {}).get('videos', [])"
)
```
**ASSERTION_FAILURE** (test assertions fail but agent ran successfully):
- Check if the agent's output is actually wrong → fix the prompt
- Check if the test's expectations are unrealistic → fix the test
- Use `get_agent_session_memory` to see what the agent actually produced
**TIMEOUT / STALL** (agent runs too long):
- Check `node_visit_counts` for feedback loops hitting max_node_visits
- Check L3 logs for tool calls that hang
- Reduce `max_iterations` in loop_config or fix the prompt to converge faster
**API_ERROR** (connection, rate limit, auth):
- Verify credentials with `/hive-credentials`
- Check MCP server configuration
---
### Phase 5: Recover & Resume
After fixing the agent, decide whether to resume or re-run.
#### When to resume from checkpoint
Resume when ALL of these are true:
- The fix is to a node that comes AFTER existing clean checkpoints
- Clean checkpoints exist (from a CLI execution with checkpointing)
- The early nodes are expensive (web scraping, API calls, long LLM chains)
```bash
# Resume from the last clean checkpoint before the failing node
uv run hive run exports/{agent_name} \
--resume-session session_20260209_143022_abc12345 \
--checkpoint cp_node_complete_research_143030
```
This skips all nodes before the checkpoint and only re-runs the fixed node onward.
#### When to re-run from scratch
Re-run when ANY of these are true:
- The fix is to the entry node or an early node
- No checkpoints exist (e.g., agent was run via `run_tests`)
- The agent is fast (2-3 nodes, completes in seconds)
- You changed the graph structure (added/removed nodes/edges)
```bash
uv run hive run exports/{agent_name} --input '{"query": "test topic"}'
```
#### Inspecting a checkpoint before resuming
```python
get_agent_checkpoint(
agent_work_dir="~/.hive/agents/{agent_name}",
session_id="session_20260209_143022_abc12345",
checkpoint_id="cp_node_complete_research_143030"
)
```
Returns the full checkpoint: shared_memory snapshot, execution_path, current_node, next_node, is_clean.
#### Loop back to Phase 2
After resuming or re-running, check if the fix worked. If not, go back to Phase 3.
---
### Phase 6: Final Verification
Once the iterative fix loop converges (the agent produces correct output), run the full automated test suite:
```python
run_tests(
goal_id="your-goal-id",
agent_path="exports/{agent_name}"
)
```
All tests should pass. If not, repeat the loop for remaining failures.
---
## Credential Requirements
**CRITICAL: Testing requires ALL credentials the agent depends on.** This includes both the LLM API key AND any tool-specific credentials (HubSpot, Brave Search, etc.).
### Prerequisites
Before running agent tests, you MUST collect ALL required credentials from the user.
**Step 1: LLM API Key (always required)**
```bash
export ANTHROPIC_API_KEY="your-key-here"
```
**Step 2: Tool-specific credentials (depends on agent's tools)**
Inspect the agent's `mcp_servers.json` and tool configuration to determine which tools the agent uses, then check for all required credentials:
```python
from aden_tools.credentials import CredentialManager, CREDENTIAL_SPECS
creds = CredentialManager()
# Determine which tools the agent uses (from agent.json or mcp_servers.json)
agent_tools = [...] # e.g., ["hubspot_search_contacts", "web_search", ...]
# Find all missing credentials for those tools
missing = creds.get_missing_for_tools(agent_tools)
```
Common tool credentials:
| Tool | Env Var | Help URL |
|------|---------|----------|
| HubSpot CRM | `HUBSPOT_ACCESS_TOKEN` | https://developers.hubspot.com/docs/api/private-apps |
| Brave Search | `BRAVE_SEARCH_API_KEY` | https://brave.com/search/api/ |
| Google Search | `GOOGLE_SEARCH_API_KEY` + `GOOGLE_SEARCH_CX` | https://developers.google.com/custom-search |
**Why ALL credentials are required:**
- Tests need to execute the agent's LLM nodes to validate behavior
- Tools with missing credentials will return error dicts instead of real data
- Mock mode bypasses everything, providing no confidence in real-world performance
### Mock Mode Limitations
Mock mode (`--mock` flag or `MOCK_MODE=1`) is **ONLY for structure validation**:
- Validates graph structure (nodes, edges, connections)
- Validates that `AgentRunner.load()` succeeds and the agent is importable
- Does NOT execute event_loop agents — MockLLMProvider never calls `set_output`, so event_loop nodes loop forever
- Does NOT test LLM reasoning, content quality, or constraint validation
- Does NOT test real API integrations or tool use
**Bottom line:** If you're testing whether an agent achieves its goal, you MUST use real credentials.
### Enforcing Credentials in Tests
When writing tests, **ALWAYS include credential checks**:
```python
import os
import pytest
from aden_tools.credentials import CredentialManager
pytestmark = pytest.mark.skipif(
not CredentialManager().is_available("anthropic") and not os.environ.get("MOCK_MODE"),
reason="API key required for real testing. Set ANTHROPIC_API_KEY or use MOCK_MODE=1."
)
@pytest.fixture(scope="session", autouse=True)
def check_credentials():
"""Ensure ALL required credentials are set for real testing."""
creds = CredentialManager()
mock_mode = os.environ.get("MOCK_MODE")
if not creds.is_available("anthropic"):
if mock_mode:
print("\nRunning in MOCK MODE - structure validation only")
else:
pytest.fail(
"\nANTHROPIC_API_KEY not set!\n"
"Set API key: export ANTHROPIC_API_KEY='your-key-here'\n"
"Or run structure validation: MOCK_MODE=1 pytest exports/{agent}/tests/"
)
if not mock_mode:
agent_tools = [] # Update per agent
missing = creds.get_missing_for_tools(agent_tools)
if missing:
lines = ["\nMissing tool credentials!"]
for name in missing:
spec = creds.specs.get(name)
if spec:
lines.append(f" {spec.env_var} - {spec.description}")
pytest.fail("\n".join(lines))
```
### User Communication
When the user asks to test an agent, **ALWAYS check for ALL credentials first**:
1. **Identify the agent's tools** from `mcp_servers.json`
2. **Check ALL required credentials** using `CredentialManager`
3. **Ask the user to provide any missing credentials** before proceeding
4. Collect ALL missing credentials in a single prompt — not one at a time
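For example, a sketch of assembling that single prompt with the same `CredentialManager` API used above (the tool list is illustrative; in practice it comes from `mcp_servers.json`):
```python
from aden_tools.credentials import CredentialManager

creds = CredentialManager()
agent_tools = ["hubspot_search_contacts", "web_search"]  # illustrative
missing = creds.get_missing_for_tools(agent_tools)
if missing:
    lines = ["To test this agent I need the following credentials:"]
    for name in missing:
        spec = creds.specs.get(name)
        if spec:
            lines.append(f"  {spec.env_var} - {spec.description}")
    # Ask the user once, with the complete list
    print("\n".join(lines))
```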
---
## Safe Test Patterns
### OutputCleaner
The framework automatically validates and cleans node outputs using a fast LLM at edge traversal time. Tests should still use safe patterns because OutputCleaner may not catch all issues.
### Safe Access (REQUIRED)
```python
# UNSAFE - will crash on missing keys
approval = result.output["approval_decision"]
category = result.output["analysis"]["category"]
# SAFE - use .get() with defaults
output = result.output or {}
approval = output.get("approval_decision", "UNKNOWN")
# SAFE - type check before operations
analysis = output.get("analysis", {})
if isinstance(analysis, dict):
category = analysis.get("category", "unknown")
# SAFE - handle JSON parsing trap (LLM response as string)
import json
recommendation = output.get("recommendation", "{}")
if isinstance(recommendation, str):
try:
parsed = json.loads(recommendation)
if isinstance(parsed, dict):
approval = parsed.get("approval_decision", "UNKNOWN")
except json.JSONDecodeError:
approval = "UNKNOWN"
elif isinstance(recommendation, dict):
approval = recommendation.get("approval_decision", "UNKNOWN")
# SAFE - type check before iteration
items = output.get("items", [])
if isinstance(items, list):
for item in items:
...
```
### Helper Functions for conftest.py
```python
import json
import re
import pytest
def _parse_json_from_output(result, key):
    """Parse JSON from agent output (framework may store full LLM response as string)."""
    output = result.output or {}
    response_text = output.get(key, "")
    try:
        json_text = re.sub(r'```json\s*|\s*```', '', response_text).strip()
        return json.loads(json_text)
    except (json.JSONDecodeError, AttributeError, TypeError):
        # Non-string values (or unparseable text) fall back to the raw value
        return output.get(key)
def safe_get_nested(result, key_path, default=None):
"""Safely get nested value from result.output."""
output = result.output or {}
current = output
for key in key_path:
if isinstance(current, dict):
current = current.get(key)
elif isinstance(current, str):
try:
json_text = re.sub(r'```json\s*|\s*```', '', current).strip()
parsed = json.loads(json_text)
if isinstance(parsed, dict):
current = parsed.get(key)
else:
return default
except json.JSONDecodeError:
return default
else:
return default
return current if current is not None else default
# Make available in tests
pytest.parse_json_from_output = _parse_json_from_output
pytest.safe_get_nested = safe_get_nested
```
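A test can then use the helpers like this (a sketch; the `recommendation` and `approval_decision` keys are illustrative):
```python
@pytest.mark.asyncio
async def test_approval_decision(runner, auto_responder, mock_mode):
    """Read a possibly-stringified nested value via the conftest helper."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "sample application"})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    # Walks output["recommendation"]["approval_decision"], parsing JSON strings along the way
    approval = pytest.safe_get_nested(
        result, ["recommendation", "approval_decision"], default="UNKNOWN"
    )
    assert approval in {"APPROVED", "REJECTED"}, f"Unexpected decision: {approval}"
```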
### ExecutionResult Fields
**`result.success=True` means NO exception, NOT goal achieved**
```python
# WRONG
assert result.success
# RIGHT
assert result.success, f"Agent failed: {result.error}"
output = result.output or {}
approval = output.get("approval_decision")
assert approval == "APPROVED", f"Expected APPROVED, got {approval}"
```
All fields:
- `success: bool` — Completed without exception (NOT goal achieved!)
- `output: dict` — Complete memory snapshot (may contain raw strings)
- `error: str | None` — Error message if failed
- `steps_executed: int` — Number of nodes executed
- `total_tokens: int` — Cumulative token usage
- `total_latency_ms: int` — Total execution time
- `path: list[str]` — Node IDs traversed (may repeat in feedback loops)
- `paused_at: str | None` — Node ID if paused
- `session_state: dict` — State for resuming
- `node_visit_counts: dict[str, int]` — Visit counts per node (feedback loop testing)
- `execution_quality: str` — "clean", "degraded", or "failed"
### Test Count Guidance
**Write 8-15 tests, not 30+**
- 2-3 tests per success criterion
- 1 happy path test
- 1 boundary/edge case test
- 1 error handling test (optional)
Each real test costs ~3 seconds + LLM tokens. 12 tests = ~36 seconds, $0.12.
---
## Test Patterns
### Happy Path
```python
@pytest.mark.asyncio
async def test_happy_path(runner, auto_responder, mock_mode):
"""Test normal successful execution."""
await auto_responder.start()
try:
result = await runner.run({"query": "python tutorials"})
finally:
await auto_responder.stop()
assert result.success, f"Agent failed: {result.error}"
output = result.output or {}
assert output.get("report"), "No report produced"
```
### Boundary Condition
```python
@pytest.mark.asyncio
async def test_minimum_sources(runner, auto_responder, mock_mode):
"""Test at minimum source threshold."""
await auto_responder.start()
try:
result = await runner.run({"query": "niche topic"})
finally:
await auto_responder.stop()
assert result.success, f"Agent failed: {result.error}"
output = result.output or {}
sources = output.get("sources", [])
if isinstance(sources, list):
assert len(sources) >= 3, f"Expected >= 3 sources, got {len(sources)}"
```
### Error Handling
```python
@pytest.mark.asyncio
async def test_empty_input(runner, auto_responder, mock_mode):
"""Test graceful handling of empty input."""
await auto_responder.start()
try:
result = await runner.run({"query": ""})
finally:
await auto_responder.stop()
# Agent should either fail gracefully or produce an error message
output = result.output or {}
assert not result.success or output.get("error"), "Should handle empty input"
```
### Feedback Loop
```python
@pytest.mark.asyncio
async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
"""Test that feedback loops don't run forever."""
await auto_responder.start()
try:
result = await runner.run({"query": "test"})
finally:
await auto_responder.stop()
visits = result.node_visit_counts or {}
for node_id, count in visits.items():
assert count <= 5, f"Node {node_id} visited {count} times — possible infinite loop"
```
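### Constraint Check
Constraint tests use the same shape; they assert that a hard requirement holds regardless of scenario. This sketch checks a typical source-attribution constraint (the `report` key is illustrative):
```python
@pytest.mark.asyncio
async def test_constraint_source_attribution(runner, auto_responder, mock_mode):
    """Hard requirement: claims in the report carry citations back to sources."""
    await auto_responder.start()
    try:
        result = await runner.run({"query": "renewable energy trends"})
    finally:
        await auto_responder.stop()
    assert result.success, f"Agent failed: {result.error}"
    output = result.output or {}
    report = str(output.get("report", ""))
    # Cheap structural check: numbered citations like [1] appear in the report text
    assert "[1]" in report or "[source" in report.lower(), "Report lacks citations"
```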
---
## MCP Tool Reference
### Phase 1: Test Generation
```python
# Check existing tests
list_tests(goal_id, agent_path)
# Get constraint test guidelines (returns templates, NOT generated tests)
generate_constraint_tests(goal_id, goal_json, agent_path)
# Returns: output_file, file_header, test_template, constraints_formatted, test_guidelines
# Get success criteria test guidelines
generate_success_tests(goal_id, goal_json, node_names, tool_names, agent_path)
# Returns: output_file, file_header, test_template, success_criteria_formatted, test_guidelines
```
### Phase 2: Execution
```python
# Automated regression (no checkpoints, fresh runs)
run_tests(goal_id, agent_path, test_types='["all"]', parallel=-1, fail_fast=False)
# Run only specific test types
run_tests(goal_id, agent_path, test_types='["constraint"]')
run_tests(goal_id, agent_path, test_types='["success"]')
```
```bash
# Iterative debugging with checkpoints (via CLI)
uv run hive run exports/{agent_name} --input '{"query": "test"}'
```
### Phase 3: Analysis
```python
# Debug a specific failed test
debug_test(goal_id, test_name, agent_path)
# Find failed sessions
list_agent_sessions(agent_work_dir, status="failed", limit=5)
# Inspect session state (excludes memory values)
get_agent_session_state(agent_work_dir, session_id)
# Inspect memory data
get_agent_session_memory(agent_work_dir, session_id, key="research_results")
# Runtime logs: L1 summaries
query_runtime_logs(agent_work_dir, status="needs_attention")
# Runtime logs: L2 per-node details
query_runtime_log_details(agent_work_dir, run_id, needs_attention_only=True)
# Runtime logs: L3 tool/LLM raw data
query_runtime_log_raw(agent_work_dir, run_id, node_id="research")
# Find clean checkpoints
list_agent_checkpoints(agent_work_dir, session_id, is_clean="true")
# Compare checkpoints (memory diff)
compare_agent_checkpoints(agent_work_dir, session_id, cp_before, cp_after)
```
### Phase 5: Recovery
```python
# Inspect checkpoint before resuming
get_agent_checkpoint(agent_work_dir, session_id, checkpoint_id)
# Empty checkpoint_id = latest checkpoint
```
```bash
# Resume from checkpoint via CLI (headless)
uv run hive run exports/{agent_name} \
--resume-session {session_id} --checkpoint {checkpoint_id}
```
---
## Anti-Patterns
| Don't | Do Instead |
|-------|-----------|
| Use `default_agent.run()` in tests | Use `runner.run()` with `auto_responder` fixtures (goes through AgentRuntime) |
| Re-run entire agent when a late node fails | Resume from last clean checkpoint |
| Treat `result.success` as goal achieved | Check `result.output` for actual criteria |
| Access `result.output["key"]` directly | Use `result.output.get("key")` |
| Fix random things hoping tests pass | Analyze L2/L3 logs to find root cause first |
| Write 30+ tests | Write 8-15 focused tests |
| Skip credential check | Use `/hive-credentials` before testing |
| Confuse `exports/` with `~/.hive/agents/` | Code in `exports/`, runtime data in `~/.hive/` |
| Use `run_tests` for iterative debugging | Use headless CLI with checkpoints for iterative debugging |
| Use headless CLI for final regression | Use `run_tests` for automated regression |
| Use `--tui` from Claude Code | Use headless `run` command — TUI hangs in non-interactive shells |
| Test client-facing nodes from Claude Code | Use mock mode, or have the user run the agent in their terminal |
| Run tests without reading goal first | Always understand the goal before writing tests |
| Skip Phase 3 analysis and guess | Use session + log tools to identify root cause |
---
## Example Walkthrough: Deep Research Agent
A complete iteration showing the test loop for an agent with nodes: `intake → research → review → report`.
### Phase 1: Generate tests
```python
# Read the goal
Read(file_path="exports/deep_research_agent/agent.py")
# Get success criteria test guidelines
result = generate_success_tests(
goal_id="rigorous-interactive-research",
goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "target": ">=5"}, {"id": "citation-coverage", "target": "100%"}, {"id": "report-completeness", "target": "90%"}]}',
node_names="intake,research,review,report",
tool_names="web_search,web_scrape",
agent_path="exports/deep_research_agent"
)
# Write tests
Write(
file_path=result["output_file"],
content=result["file_header"] + "\n\n" + test_code
)
```
### Phase 2: First execution
```python
run_tests(
goal_id="rigorous-interactive-research",
agent_path="exports/deep_research_agent",
fail_fast=True
)
```
Result: `test_success_source_diversity` fails — agent only found 2 sources instead of 5.
### Phase 3: Analyze
```python
# Debug the failing test
debug_test(
goal_id="rigorous-interactive-research",
test_name="test_success_source_diversity",
agent_path="exports/deep_research_agent"
)
# → ASSERTION_FAILURE: Expected >= 5 sources, got 2
# Find the session
list_agent_sessions(
agent_work_dir="~/.hive/agents/deep_research_agent",
status="completed",
limit=1
)
# → session_20260209_150000_abc12345
# See what the research node produced
get_agent_session_memory(
agent_work_dir="~/.hive/agents/deep_research_agent",
session_id="session_20260209_150000_abc12345",
key="research_results"
)
# → Only 2 web_search calls made, each returned 1 source
# Check the LLM's behavior in the research node
query_runtime_log_raw(
agent_work_dir="~/.hive/agents/deep_research_agent",
run_id="session_20260209_150000_abc12345",
node_id="research"
)
# → LLM called web_search only twice, then called set_output
```
Root cause: The research node's prompt doesn't tell the LLM to search for at least 5 diverse sources. It stops after the first couple of searches.
### Phase 4: Fix the prompt
```python
Read(file_path="exports/deep_research_agent/nodes/__init__.py")
Edit(
file_path="exports/deep_research_agent/nodes/__init__.py",
old_string='system_prompt="Search for information on the user\'s topic."',
new_string='system_prompt="Search for information on the user\'s topic. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries to ensure source diversity. Do not stop searching until you have at least 5 distinct sources."'
)
```
### Phase 5: Resume from checkpoint
For this example, the fix is to the `research` node. If we had run via CLI with checkpointing, we could resume from the checkpoint after `intake` to skip re-running intake:
```bash
# Check if clean checkpoint exists after intake
list_agent_checkpoints(
agent_work_dir="~/.hive/agents/deep_research_agent",
session_id="session_20260209_150000_abc12345",
is_clean="true"
)
# → cp_node_complete_intake_150005
# Resume from after intake, re-run research with fixed prompt
uv run hive run exports/deep_research_agent \
--resume-session session_20260209_150000_abc12345 \
--checkpoint cp_node_complete_intake_150005
```
Or for this simple case (intake is fast), just re-run:
```bash
uv run hive run exports/deep_research_agent --input '{"topic": "test"}'
```
### Phase 6: Final verification
```python
run_tests(
goal_id="rigorous-interactive-research",
agent_path="exports/deep_research_agent"
)
# → All 12 tests pass
```
---
## Test File Structure
```
exports/{agent_name}/
├── agent.py ← Agent to test (goal, nodes, edges)
├── nodes/__init__.py ← Node implementations (prompts, config)
├── config.py ← Agent configuration
├── mcp_servers.json ← Tool server config
└── tests/
├── conftest.py ← Shared fixtures + safe access helpers
├── test_constraints.py ← Constraint tests
├── test_success_criteria.py ← Success criteria tests
└── test_edge_cases.py ← Edge case tests
```
## Integration with Other Skills
| Scenario | From | To | Action |
|----------|------|----|--------|
| Agent built, ready to test | `/hive-create` | `/hive-test` | Generate tests, start loop |
| Prompt fix needed | `/hive-test` Phase 4 | Direct edit | Edit `nodes/__init__.py`, resume |
| Goal definition wrong | `/hive-test` Phase 4 | `/hive-create` | Update goal, may need rebuild |
| Missing credentials | `/hive-test` Phase 3 | `/hive-credentials` | Set up credentials |
| Complex runtime failure | `/hive-test` Phase 3 | `/hive-debugger` | Deep L1/L2/L3 analysis |
| All tests pass | `/hive-test` Phase 6 | Done | Agent validated |
# Example: Iterative Testing of a Research Agent
This example walks through the full iterative test loop for a research agent that searches the web, reviews findings, and produces a cited report.
## Agent Structure
```
exports/deep_research_agent/
├── agent.py # Goal + graph: intake → research → review → report
├── nodes/__init__.py # Node definitions (system_prompt, input/output keys)
├── config.py # Model config
├── mcp_servers.json # Tools: web_search, web_scrape
└── tests/ # Test files (we'll create these)
```
**Goal:** "Rigorous Interactive Research" — find 5+ diverse sources, cite every claim, produce a complete report.
---
## Phase 1: Generate Tests
### Read the goal
```python
Read(file_path="exports/deep_research_agent/agent.py")
# Extract: goal_id="rigorous-interactive-research"
# success_criteria: source-diversity (>=5), citation-coverage (100%), report-completeness (90%)
# constraints: no-hallucination, source-attribution
```
### Get test guidelines
```python
result = generate_success_tests(
goal_id="rigorous-interactive-research",
goal_json='{"id": "rigorous-interactive-research", "success_criteria": [{"id": "source-diversity", "description": "Use multiple diverse sources", "target": ">=5"}, {"id": "citation-coverage", "description": "Every claim cites its source", "target": "100%"}, {"id": "report-completeness", "description": "Report answers the research questions", "target": "90%"}]}',
node_names="intake,research,review,report",
tool_names="web_search,web_scrape",
agent_path="exports/deep_research_agent"
)
```
### Write tests
```python
Write(
file_path="exports/deep_research_agent/tests/test_success_criteria.py",
content=result["file_header"] + '''
@pytest.mark.asyncio
async def test_success_source_diversity(runner, auto_responder, mock_mode):
"""At least 5 diverse sources are found."""
await auto_responder.start()
try:
result = await runner.run({"query": "impact of remote work on productivity"})
finally:
await auto_responder.stop()
assert result.success, f"Agent failed: {result.error}"
output = result.output or {}
sources = output.get("sources", [])
if isinstance(sources, list):
assert len(sources) >= 5, f"Expected >= 5 sources, got {len(sources)}"
@pytest.mark.asyncio
async def test_success_citation_coverage(runner, auto_responder, mock_mode):
"""Every factual claim in the report cites its source."""
await auto_responder.start()
try:
result = await runner.run({"query": "climate change effects on agriculture"})
finally:
await auto_responder.stop()
assert result.success, f"Agent failed: {result.error}"
output = result.output or {}
report = output.get("report", "")
# Check that report contains numbered references
assert "[1]" in str(report) or "[source" in str(report).lower(), "Report lacks citations"
@pytest.mark.asyncio
async def test_success_report_completeness(runner, auto_responder, mock_mode):
"""Report addresses the original research question."""
query = "pros and cons of nuclear energy"
await auto_responder.start()
try:
result = await runner.run({"query": query})
finally:
await auto_responder.stop()
assert result.success, f"Agent failed: {result.error}"
output = result.output or {}
report = output.get("report", "")
assert len(str(report)) > 200, f"Report too short: {len(str(report))} chars"
@pytest.mark.asyncio
async def test_empty_query_handling(runner, auto_responder, mock_mode):
"""Agent handles empty input gracefully."""
await auto_responder.start()
try:
result = await runner.run({"query": ""})
finally:
await auto_responder.stop()
output = result.output or {}
assert not result.success or output.get("error"), "Should handle empty query"
@pytest.mark.asyncio
async def test_feedback_loop_terminates(runner, auto_responder, mock_mode):
"""Feedback loop between review and research terminates."""
await auto_responder.start()
try:
result = await runner.run({"query": "quantum computing basics"})
finally:
await auto_responder.stop()
visits = result.node_visit_counts or {}
for node_id, count in visits.items():
assert count <= 5, f"Node {node_id} visited {count} times"
'''
)
```
---
## Phase 2: First Execution
```python
run_tests(
goal_id="rigorous-interactive-research",
agent_path="exports/deep_research_agent",
fail_fast=True
)
```
**Result:**
```json
{
"overall_passed": false,
"summary": {"total": 5, "passed": 3, "failed": 2, "pass_rate": "60.0%"},
"failures": [
{"test_name": "test_success_source_diversity", "details": "AssertionError: Expected >= 5 sources, got 2"},
{"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
]
}
```
---
## Phase 3: Analyze (Iteration 1)
### Debug the first failure
```python
debug_test(
goal_id="rigorous-interactive-research",
test_name="test_success_source_diversity",
agent_path="exports/deep_research_agent"
)
# Category: ASSERTION_FAILURE — Expected >= 5 sources, got 2
```
### Find the session and inspect memory
```python
list_agent_sessions(
agent_work_dir="~/.hive/agents/deep_research_agent",
status="completed",
limit=1
)
# → session_20260209_150000_abc12345
get_agent_session_memory(
agent_work_dir="~/.hive/agents/deep_research_agent",
session_id="session_20260209_150000_abc12345",
key="research_results"
)
# → Only 2 sources found. LLM stopped searching after 2 queries.
```
### Check LLM behavior in the research node
```python
query_runtime_log_raw(
agent_work_dir="~/.hive/agents/deep_research_agent",
run_id="session_20260209_150000_abc12345",
node_id="research"
)
# → LLM called web_search twice, got results, immediately called set_output.
# → Prompt doesn't instruct it to find at least 5 sources.
```
**Root cause:** The research node's system_prompt doesn't specify minimum source requirements.
---
## Phase 4: Fix (Iteration 1)
```python
Read(file_path="exports/deep_research_agent/nodes/__init__.py")
# Fix the research node prompt
Edit(
file_path="exports/deep_research_agent/nodes/__init__.py",
old_string='system_prompt="Search for information on the user\'s topic using web search."',
new_string='system_prompt="Search for information on the user\'s topic using web search. You MUST find at least 5 diverse, authoritative sources. Use multiple different search queries with varied keywords. Do NOT call set_output until you have gathered at least 5 distinct sources from different domains."'
)
```
---
## Phase 5: Recover & Resume (Iteration 1)
The fix is to the `research` node. Since this was a `run_tests` execution (no checkpoints), we re-run from scratch:
```python
run_tests(
goal_id="rigorous-interactive-research",
agent_path="exports/deep_research_agent",
fail_fast=True
)
```
**Result:**
```json
{
"overall_passed": false,
"summary": {"total": 5, "passed": 4, "failed": 1, "pass_rate": "80.0%"},
"failures": [
{"test_name": "test_success_citation_coverage", "details": "AssertionError: Report lacks citations"}
]
}
```
Source diversity now passes. Citation coverage still fails.
---
## Phase 3: Analyze (Iteration 2)
```python
debug_test(
goal_id="rigorous-interactive-research",
test_name="test_success_citation_coverage",
agent_path="exports/deep_research_agent"
)
# Category: ASSERTION_FAILURE — Report lacks citations
# Check what the report node produced
list_agent_sessions(
agent_work_dir="~/.hive/agents/deep_research_agent",
status="completed",
limit=1
)
# → session_20260209_151500_def67890
get_agent_session_memory(
agent_work_dir="~/.hive/agents/deep_research_agent",
session_id="session_20260209_151500_def67890",
key="report"
)
# → Report text exists but uses no numbered references.
# → Sources are in memory but report node doesn't cite them.
```
**Root cause:** The report node's prompt doesn't instruct the LLM to include numbered citations.
---
## Phase 4: Fix (Iteration 2)
```python
Edit(
file_path="exports/deep_research_agent/nodes/__init__.py",
old_string='system_prompt="Write a comprehensive report based on the research findings."',
new_string='system_prompt="Write a comprehensive report based on the research findings. You MUST include numbered citations [1], [2], etc. for every factual claim. At the end, include a References section listing all sources with their URLs. Every claim must be traceable to a specific source."'
)
```
---
## Phase 5: Resume (Iteration 2)
The fix is to the `report` node (the last node). To demonstrate checkpoint recovery, run via CLI:
```bash
# Run via CLI to get checkpoints
uv run hive run exports/deep_research_agent --input '{"topic": "climate change effects"}'
# After it runs, find the clean checkpoint before report
list_agent_checkpoints(
agent_work_dir="~/.hive/agents/deep_research_agent",
session_id="session_20260209_152000_ghi34567",
is_clean="true"
)
# → cp_node_complete_review_152100 (after review, before report)
# Resume — skips intake, research, review entirely
uv run hive run exports/deep_research_agent \
--resume-session session_20260209_152000_ghi34567 \
--checkpoint cp_node_complete_review_152100
```
Only the `report` node re-runs with the fixed prompt, using research data from the checkpoint.
---
## Phase 6: Final Verification
```python
run_tests(
goal_id="rigorous-interactive-research",
agent_path="exports/deep_research_agent"
)
```
**Result:**
```json
{
"overall_passed": true,
"summary": {"total": 5, "passed": 5, "failed": 0, "pass_rate": "100.0%"}
}
```
All tests pass.
---
## Summary
| Iteration | Failure | Root Cause | Fix | Recovery |
|-----------|---------|------------|-----|----------|
| 1 | Source diversity (2 < 5) | Research prompt too vague | Added "at least 5 sources" to prompt | Re-run (no checkpoints) |
| 2 | No citations in report | Report prompt lacks citation instructions | Added citation requirements | Checkpoint resume (skipped 3 nodes) |
**Key takeaways:**
- Phase 3 analysis (session memory + L3 logs) identified root causes without guessing
- Checkpoint recovery in iteration 2 saved time by skipping 3 expensive nodes
- Final `run_tests` confirms all scenarios pass end-to-end
---
name: hive
description: Complete workflow for building, implementing, and testing goal-driven agents. Orchestrates hive-* skills. Use when starting a new agent project, unsure which skill to use, or need end-to-end guidance.
license: Apache-2.0
metadata:
author: hive
version: "2.0"
type: workflow-orchestrator
orchestrates:
- hive-concepts
- hive-create
- hive-patterns
- hive-test
- hive-credentials
- hive-debugger
---
# Agent Development Workflow
**THIS IS AN EXECUTABLE WORKFLOW. DO NOT explore the codebase or read source files. ROUTE to the correct skill IMMEDIATELY.**
When this skill is loaded, **ALWAYS use the AskUserQuestion tool** to present options:
```
Use AskUserQuestion with these options:
- "Build a new agent" → Then invoke /hive-create
- "Test an existing agent" → Then invoke /hive-test
- "Learn agent concepts" → Then invoke /hive-concepts
- "Optimize agent design" → Then invoke /hive-patterns
- "Set up credentials" → Then invoke /hive-credentials
- "Debug a failing agent" → Then invoke /hive-debugger
- "Other" (please describe what you want to achieve)
```
**DO NOT:** Read source files, explore the codebase, search for code, or do any investigation before routing. The sub-skills handle all of that.
---
Complete Standard Operating Procedure (SOP) for building production-ready goal-driven agents.
## Overview
This workflow orchestrates specialized skills to take you from initial concept to production-ready agent:
1. **Understand Concepts** → `/hive-concepts` (optional)
2. **Build Structure** → `/hive-create`
3. **Optimize Design** → `/hive-patterns` (optional)
4. **Setup Credentials** → `/hive-credentials` (if agent uses tools requiring API keys)
5. **Test & Validate** → `/hive-test`
6. **Debug Issues** → `/hive-debugger` (if agent fails at runtime)
## When to Use This Workflow
Use this meta-skill when:
- Starting a new agent from scratch
- Unclear which skill to use first
- Need end-to-end guidance for agent development
- Want consistent, repeatable agent builds
**Skip this workflow** if:
- You only need to test an existing agent → use `/hive-test` directly
- You know exactly which phase you're in → use specific skill directly
## Quick Decision Tree
```
"Need to understand agent concepts" → hive-concepts
"Build a new agent" → hive-create
"Optimize my agent design" → hive-patterns
"Need client-facing nodes or feedback loops" → hive-patterns
"Set up API keys for my agent" → hive-credentials
"Test my agent" → hive-test
"My agent is failing/stuck/has errors" → hive-debugger
"Not sure what I need" → Read phases below, then decide
"Agent has structure but needs implementation" → See agent directory STATUS.md
```
## Phase 0: Understand Concepts (Optional)
**Skill**: `/hive-concepts`
**Input**: Questions about agent architecture
### When to Use
- First time building an agent
- Need to understand node types, edges, goals
- Want to validate tool availability
- Learning about event loop architecture and client-facing nodes
### What This Phase Provides
- Architecture overview (Python packages, not JSON)
- Core concepts (Goal, Node, Edge, Event Loop, Judges)
- Tool discovery and validation procedures
- Workflow overview
**Skip this phase** if you already understand agent fundamentals.
## Phase 1: Build Agent Structure
**Skill**: `/hive-create`
**Input**: User requirements ("Build an agent that...") or a template to start from
### What This Phase Does
Creates the complete agent architecture:
- Package structure (`exports/agent_name/`)
- Goal with success criteria and constraints
- Workflow graph (nodes and edges)
- Node specifications
- CLI interface
- Documentation
### Process
1. **Create package** - Directory structure with skeleton files
2. **Define goal** - Success criteria and constraints written to agent.py
3. **Design nodes** - Each node approved and written incrementally
4. **Connect edges** - Workflow graph with conditional routing
5. **Finalize** - Agent class, exports, and documentation
### Outputs
- ✅ `exports/agent_name/` package created
- ✅ Goal defined in agent.py
- ✅ 3-5 success criteria defined
- ✅ 1-5 constraints defined
- ✅ 5-10 nodes specified in nodes/__init__.py
- ✅ 8-15 edges connecting workflow
- ✅ Validated structure (passes `uv run python -m agent_name validate`)
- ✅ README.md with usage instructions
- ✅ CLI commands (info, validate, run, shell)
### Success Criteria
You're ready for Phase 2 when:
- Agent structure validates without errors
- All nodes and edges are defined
- CLI commands work (info, validate)
- You see: "Agent complete: exports/agent_name/"
### Common Outputs
The hive-create skill produces:
```
exports/agent_name/
├── __init__.py (package exports)
├── __main__.py (CLI interface)
├── agent.py (goal, graph, agent class)
├── nodes/__init__.py (node specifications)
├── config.py (configuration)
├── implementations.py (may be created for Python functions)
└── README.md (documentation)
```
### Next Steps
**If structure complete and validated:**
→ Check `exports/agent_name/STATUS.md` or `IMPLEMENTATION_GUIDE.md`
→ These files explain implementation options
→ You may need to add Python functions or MCP tools (not covered by current skills)
**If want to optimize design:**
→ Proceed to Phase 1.5 (hive-patterns)
**If ready to test:**
→ Proceed to Phase 2
## Phase 1.5: Optimize Design (Optional)
**Skill**: `/hive-patterns`
**Input**: Completed agent structure
### When to Use
- Want to add client-facing blocking or feedback edges
- Need judge patterns for output validation
- Want fan-out/fan-in (parallel execution)
- Need error handling patterns
- Want best practices guidance
### What This Phase Provides
- Client-facing interaction patterns
- Feedback edge routing with nullable output keys
- Judge patterns (implicit, SchemaJudge)
- Fan-out/fan-in parallel execution
- Context management and spillover patterns
- Anti-patterns to avoid
**Skip this phase** if your agent design is straightforward.
## Phase 2: Test & Validate
**Skill**: `/hive-test`
**Input**: Working agent from Phase 1
### What This Phase Does
Guides the creation and execution of a comprehensive test suite:
- Constraint tests
- Success criteria tests
- Edge case tests
- Integration tests
### Process
1. **Analyze agent** - Read goal, constraints, success criteria
2. **Generate tests** - The calling agent writes pytest files in `exports/agent_name/tests/` using hive-test guidelines and templates
3. **User approval** - Review and approve each test
4. **Run evaluation** - Execute tests and collect results
5. **Debug failures** - Identify and fix issues
6. **Iterate** - Repeat until all tests pass
### Outputs
- ✅ Test files in `exports/agent_name/tests/`
- ✅ Test report with pass/fail metrics
- ✅ Coverage of all success criteria
- ✅ Coverage of all constraints
- ✅ Edge case handling verified
### Success Criteria
You're done when:
- All tests pass
- All success criteria validated
- All constraints verified
- Agent handles edge cases
- Test coverage is comprehensive
### Next Steps
**Agent ready for:**
- Production deployment
- Integration into larger systems
- Documentation and handoff
- Continuous monitoring
## Phase Transitions
### From Phase 1 to Phase 2
**Trigger signals:**
- "Agent complete: exports/..."
- Structure validation passes
- README indicates implementation complete
**Before proceeding:**
- Verify agent can be imported: `from exports.agent_name import default_agent`
- Check if implementation is needed (see STATUS.md or IMPLEMENTATION_GUIDE.md)
- Confirm agent executes without import errors
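A quick smoke check for both points (a sketch; substitute your agent's package name):
```bash
# Import check: fails loudly if the package or its nodes have import errors
uv run python -c "from exports.agent_name import default_agent; print(default_agent)"
# Structure check: validates nodes, edges, and entry node
PYTHONPATH=exports uv run python -m agent_name validate
```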
### Skipping Phases
**When to skip Phase 1:**
- Agent structure already exists
- Only need to add tests
- Modifying existing agent
**When to skip Phase 2:**
- Prototyping or exploring
- Agent not production-bound
- Manual testing sufficient
## Common Patterns
### Pattern 1: Complete New Build (Simple)
```
User: "Build an agent that monitors files"
→ Use /hive-create
→ Agent structure created
→ Use /hive-test
→ Tests created and passing
→ Done: Production-ready agent
```
### Pattern 1b: Complete New Build (With Learning)
```
User: "Build an agent (first time)"
→ Use /hive-concepts (understand concepts)
→ Use /hive-create (build structure)
→ Use /hive-patterns (optimize design)
→ Use /hive-test (validate)
→ Done: Production-ready agent
```
### Pattern 1c: Build from Template
```
User: "Build an agent based on the deep research template"
→ Use /hive-create
→ Select "From a template" path
→ Pick template, name new agent
→ Review/modify goal, nodes, graph
→ Agent exported with customizations
→ Use /hive-test
→ Done: Customized agent
```
### Pattern 2: Test Existing Agent
```
User: "Test my agent at exports/my_agent"
→ Skip Phase 1
→ Use /hive-test directly
→ Tests created
→ Done: Validated agent
```
### Pattern 3: Iterative Development
```
User: "Build an agent"
→ Use /hive-create (Phase 1)
→ Implementation needed (see STATUS.md)
→ [User implements functions]
→ Use /hive-test (Phase 2)
→ Tests reveal bugs
→ [Fix bugs manually]
→ Re-run tests
→ Done: Working agent
```
### Pattern 4: Agent with Review Loops and HITL Checkpoints
```
User: "Build an agent with human review and feedback loops"
→ Use /hive-concepts (learn event loop, client-facing nodes)
→ Use /hive-create (build structure with feedback edges)
→ Use /hive-patterns (implement client-facing + feedback patterns)
→ Use /hive-test (validate review flows and edge routing)
→ Done: Agent with HITL checkpoints and review loops
```
## Skill Dependencies
```
hive (meta-skill)
├── hive-concepts (foundational)
│ ├── Architecture concepts (event loop, judges)
│ ├── Node types (event_loop, function)
│ ├── Edge routing and priority
│ ├── Tool discovery procedures
│ └── Workflow overview
├── hive-create (procedural)
│ ├── Creates package structure
│ ├── Defines goal
│ ├── Adds nodes (event_loop, function)
│ ├── Connects edges with priority routing
│ ├── Finalizes agent class
│ └── Requires: hive-concepts
├── hive-patterns (reference)
│ ├── Client-facing interaction patterns
│ ├── Feedback edges and review loops
│ ├── Judge patterns (implicit, SchemaJudge)
│ ├── Fan-out/fan-in parallel execution
│ └── Context management and anti-patterns
├── hive-credentials (utility)
│ ├── Detects missing credentials
│ ├── Offers auth method choices (Aden OAuth, direct API key)
│ ├── Stores securely in ~/.hive/credentials
│ └── Validates with health checks
├── hive-test (validation)
│ ├── Reads agent goal
│ ├── Generates tests
│ ├── Runs evaluation
│ └── Reports results
└── hive-debugger (troubleshooting)
├── Monitors runtime logs (L1/L2/L3)
├── Identifies retry loops, tool failures
├── Categorizes issues (10 categories)
└── Provides fix recommendations
```
## Troubleshooting
### "Agent structure won't validate"
- Check node IDs match between nodes/__init__.py and agent.py
- Verify all edges reference valid node IDs
- Ensure entry_node exists in nodes list
- Run: `PYTHONPATH=exports uv run python -m agent_name validate`
### "Agent has structure but won't run"
- Check for STATUS.md or IMPLEMENTATION_GUIDE.md in agent directory
- Implementation may be needed (Python functions or MCP tools)
- This is expected - hive-create creates structure, not implementation
- See implementation guide for completion options
### "Tests are failing"
- Review test output for specific failures
- Check agent goal and success criteria
- Verify constraints are met
- Use `/hive-test` to debug and iterate
- Fix agent code and re-run tests
### "Agent is failing at runtime"
- Use `/hive-debugger` to analyze runtime logs
- The debugger identifies retry loops, tool failures, and stalled execution
- Get actionable fix recommendations with code changes
- Monitor the agent in real-time during TUI sessions
### "Not sure which phase I'm in"
Run these checks:
```bash
# Check if agent structure exists
ls exports/my_agent/agent.py
# Check if it validates
PYTHONPATH=exports uv run python -m my_agent validate
# Check if tests exist
ls exports/my_agent/tests/
# If structure exists and validates → Phase 2 (testing)
# If structure doesn't exist → Phase 1 (building)
# If tests exist but failing → Debug phase
```
## Best Practices
### For Phase 1 (Building)
1. **Start with clear requirements** - Know what the agent should do
2. **Define success criteria early** - Measurable goals drive design
3. **Keep nodes focused** - One responsibility per node
4. **Use descriptive names** - Node IDs should explain purpose
5. **Validate incrementally** - Check structure after each major addition
### For Phase 2 (Testing)
1. **Test constraints first** - Hard requirements must pass
2. **Mock external dependencies** - Use mock mode for LLMs/APIs
3. **Cover edge cases** - Test failures, not just success paths
4. **Iterate quickly** - Fix one test at a time
5. **Document test patterns** - Future tests follow same structure
### General Workflow
1. **Use version control** - Git commit after each phase
2. **Document decisions** - Update README with changes
3. **Keep iterations small** - Build → Test → Fix → Repeat
4. **Preserve working states** - Tag successful iterations
5. **Learn from failures** - Failed tests reveal design issues
## Exit Criteria
You're done with the workflow when:
- ✅ Agent structure validates
- ✅ All tests pass
- ✅ Success criteria met
- ✅ Constraints verified
- ✅ Documentation complete
- ✅ Agent ready for deployment
## Additional Resources
- **hive-concepts**: See `.claude/skills/hive-concepts/SKILL.md`
- **hive-create**: See `.claude/skills/hive-create/SKILL.md`
- **hive-patterns**: See `.claude/skills/hive-patterns/SKILL.md`
- **hive-test**: See `.claude/skills/hive-test/SKILL.md`
- **Agent framework docs**: See `core/README.md`
- **Example agents**: See `exports/` directory
## Summary
This workflow provides a proven path from concept to production-ready agent:
1. **Learn** with `/hive-concepts` → Understand fundamentals (optional)
2. **Build** with `/hive-create` → Get validated structure
3. **Optimize** with `/hive-patterns` → Apply best practices (optional)
4. **Configure** with `/hive-credentials` → Set up API keys (if needed)
5. **Test** with `/hive-test` → Get verified functionality
6. **Debug** with `/hive-debugger` → Fix runtime issues (if needed)
The workflow is **flexible** - skip phases as needed, iterate freely, and adapt to your specific requirements. The goal is **production-ready agents** built with **consistent, repeatable processes**.
## Skill Selection Guide
**Choose hive-concepts when:**
- First time building agents
- Need to understand event loop architecture
- Validating tool availability
- Learning about node types, edges, and judges
**Choose hive-create when:**
- Actually building an agent
- Have clear requirements
- Ready to write code
- Want step-by-step guidance
- Want to start from an existing template and customize it
**Choose hive-patterns when:**
- Agent structure complete
- Need client-facing nodes or feedback edges
- Implementing review loops or fan-out/fan-in
- Want judge patterns or context management
- Want best practices
**Choose hive-test when:**
- Agent structure complete
- Ready to validate functionality
- Need comprehensive test coverage
- Testing feedback loops, output keys, or fan-out
**Choose hive-debugger when:**
- Agent is failing or stuck at runtime
- Seeing retry loops or escalations
- Tool calls are failing
- Need to understand why a node isn't completing
- Want real-time monitoring of agent execution
# Example: File Monitor Agent
This example shows the complete /hive workflow in action for building a file monitoring agent.
## Initial Request
```
User: "Build an agent that monitors ~/Downloads and copies new files to ~/Documents"
```
## Phase 1: Building (20 minutes)
### Step 1: Create Structure
Agent invokes `/hive-create` skill and:
1. Creates `exports/file_monitor_agent/` package
2. Writes skeleton files (__init__.py, __main__.py, agent.py, etc.)
**Output**: Package structure visible immediately
### Step 2: Define Goal
```python
goal = Goal(
id="file-monitor-copy",
name="Automated File Monitor & Copy",
success_criteria=[
# 100% detection rate
# 100% copy success
# 100% conflict resolution
# >99% uptime
],
constraints=[
# Preserve originals
# Handle errors gracefully
# Track state
# Respect permissions
]
)
```
**Output**: Goal written to agent.py
### Step 3: Design Nodes
7 nodes approved and written incrementally:
1. `initialize-state` - Set up tracking
2. `list-downloads` - Scan directory
3. `identify-new-files` - Find new files
4. `check-for-new-files` - Router
5. `copy-files` - Copy with conflict resolution
6. `update-state` - Mark as processed
7. `wait-interval` - Sleep between cycles
**Output**: All nodes in nodes/__init__.py
### Step 4: Connect Edges
8 edges connecting the workflow loop:
```
initialize → list → identify → check
check → copy        (new files found)
check → wait        (no new files)
copy → update → wait
wait → list         (loop)
```
**Output**: Edges written to agent.py
### Step 5: Finalize
```bash
$ PYTHONPATH=exports uv run python -m file_monitor_agent validate
✓ Agent is valid
$ PYTHONPATH=exports uv run python -m file_monitor_agent info
Agent: File Monitor & Copy Agent
Nodes: 7
Edges: 8
```
**Phase 1 Complete**: Structure validated ✅
### Status After Phase 1
```
exports/file_monitor_agent/
├── __init__.py ✅ (exports)
├── __main__.py ✅ (CLI)
├── agent.py ✅ (goal, graph, agent class)
├── nodes/__init__.py ✅ (7 nodes)
├── config.py ✅ (configuration)
├── implementations.py ✅ (Python functions)
├── README.md ✅ (documentation)
├── IMPLEMENTATION_GUIDE.md ✅ (next steps)
└── STATUS.md ✅ (current state)
```
**Note**: An implementation gap remains; the data flow between nodes still needs to be connected (covered in STATUS.md)
## Phase 2: Testing (25 minutes)
### Step 1: Analyze Agent
Agent invokes `/hive-test` skill and:
1. Reads goal from `exports/file_monitor_agent/agent.py`
2. Identifies 4 success criteria to test
3. Identifies 4 constraints to verify
4. Plans test coverage
### Step 2: Generate Tests
Creates test files:
```
exports/file_monitor_agent/tests/
├── conftest.py (fixtures)
├── test_constraints.py (4 constraint tests)
├── test_success_criteria.py (4 success tests)
└── test_edge_cases.py (error handling)
```
Tests approved incrementally by user.
### Step 3: Run Tests
```bash
$ PYTHONPATH=exports uv run pytest exports/file_monitor_agent/tests/
test_constraints.py::test_preserves_originals PASSED
test_constraints.py::test_handles_errors PASSED
test_constraints.py::test_tracks_state PASSED
test_constraints.py::test_respects_permissions PASSED
test_success_criteria.py::test_detects_all_files PASSED
test_success_criteria.py::test_copies_all_files PASSED
test_success_criteria.py::test_resolves_conflicts PASSED
test_success_criteria.py::test_continuous_run PASSED
test_edge_cases.py::test_empty_directory PASSED
test_edge_cases.py::test_permission_denied PASSED
test_edge_cases.py::test_disk_full PASSED
test_edge_cases.py::test_large_files PASSED
========================== 12 passed in 3.42s ==========================
```
**Phase 2 Complete**: All tests pass ✅
## Final Output
**Production-Ready Agent:**
```bash
# Run the agent
./RUN_AGENT.sh
# Or manually
PYTHONPATH=exports uv run python -m file_monitor_agent run
```
**Capabilities:**
- Monitors ~/Downloads continuously
- Copies new files to ~/Documents
- Resolves conflicts with timestamps
- Handles errors gracefully
- Tracks processed files
- Runs as background service
**Total Time**: ~45 minutes from concept to production
## Key Learnings
1. **Incremental building** - Files written immediately, visible throughout
2. **Validation early** - Structure validated before moving to implementation
3. **Test-driven** - Tests reveal real behavior
4. **Documentation included** - README, STATUS, and guides auto-generated
5. **Repeatable process** - Same workflow for any agent type
## Variations
**For simpler agents:**
- Fewer nodes (3-5 instead of 7)
- Simpler workflow (linear instead of looping)
- Faster build time (10-15 minutes)
**For complex agents:**
- More nodes (10-15+)
- Multiple subgraphs
- Pause/resume points for human-in-the-loop
- Longer build time (45-60 minutes)
The workflow scales to your needs!
# Integration Test Reporting Skill
Run the Level 2 dummy agent integration test suite and produce a detailed HTML report with per-test input → outcome analysis.
## Trigger
User wants to run integration tests and see results:
- `/test-reporting`
- `/test-reporting test_component_queen_live.py`
- `/test-reporting --all`
## SOP: Running Tests
### Step 1: Select Scope
If the user provides a specific test file or pattern, use it. Otherwise run the full suite.
```bash
# Full suite
cd core && echo "1" | uv run python tests/dummy_agents/run_all.py --interactive 2>&1
# Specific file (requires manual provider setup)
cd core && uv run python -c "
import sys
sys.path.insert(0, '.')
from tests.dummy_agents.run_all import detect_available
from tests.dummy_agents.conftest import set_llm_selection
avail = detect_available()
claude = [p for p in avail if 'Claude Code' in p['name']]
if not claude:
avail_names = [p['name'] for p in avail]
raise RuntimeError(f'No Claude Code subscription. Available: {avail_names}')
provider = claude[0]
set_llm_selection(
model=provider['model'],
api_key=provider['api_key'],
extra_headers=provider.get('extra_headers'),
api_base=provider.get('api_base'),
)
import pytest
sys.exit(pytest.main([
'tests/dummy_agents/TEST_FILE_HERE',
'-v', '--override-ini=asyncio_mode=auto', '--no-header', '--tb=long',
'--log-cli-level=WARNING', '--junitxml=/tmp/hive_test_results.xml',
]))
"
```
### Step 2: Collect Results
After the test run completes, collect:
1. **JUnit XML** from `--junitxml` output (if available)
2. **stdout/stderr** from the run
3. **Summary table** from `run_all.py` output (the Unicode table)
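A minimal sketch for turning the JUnit XML into per-test rows (assumes the standard `testsuite`/`testcase` layout pytest writes for `--junitxml`):
```python
import xml.etree.ElementTree as ET

def parse_junit(path="/tmp/hive_test_results.xml"):
    """Yield one dict per test case: name, classname, duration, status, detail."""
    root = ET.parse(path).getroot()
    for case in root.iter("testcase"):
        status, detail = "PASS", ""
        for tag, label in (("failure", "FAIL"), ("error", "ERROR"), ("skipped", "SKIP")):
            child = case.find(tag)
            if child is not None:
                status = label
                detail = (child.get("message") or "") + "\n" + (child.text or "")
                break
        yield {
            "classname": case.get("classname", ""),
            "test_name": case.get("name", ""),
            "duration": float(case.get("time", 0)),
            "status": status,
            "detail": detail.strip(),
        }
```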
### Step 3: Generate HTML Report
Write the report to `/tmp/hive_integration_test_report.html`.
The report MUST include these sections:
#### Header
- Run timestamp (ISO 8601)
- Provider used (model name, source)
- Total tests / passed / failed / skipped
- Total wall-clock time
- Overall verdict: PASS (all green) or FAIL (with count)
#### Per-Test Table
For EVERY test (not just failures), include a row with:
| Column | Description |
|--------|-------------|
| Component | Test file grouping (e.g., `component_queen_live`) |
| Test Name | Function name (e.g., `test_queen_starts_in_planning_without_worker`) |
| Status | PASS / FAIL / SKIP / ERROR with color badge |
| Duration | Wall-clock seconds |
| What | One-line description of what the test verifies |
| How | How it works (setup → action → assertion) |
| Why | Why this test matters (what bug/behavior it catches) |
| Input | The input data or configuration (graph spec, initial prompt, phase, etc.) |
| Expected Outcome | What the test asserts |
| Actual Outcome | What actually happened (PASS: matches expected / FAIL: actual vs expected) |
| Failure Detail | For failures only: full traceback + diagnosis |
#### What / How / Why Descriptions
These MUST be derived from the test function's docstring and code. Read each test file to extract:
- **What**: From the docstring first line
- **How**: From the test body (what fixtures, what graph, what assertions)
- **Why**: From the docstring body or "Why this matters" section in the test module
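A sketch for pulling those docstrings out of the test files with the standard library (the `core/tests/dummy_agents/` path is assumed from the commands above):
```python
import ast
from pathlib import Path

def extract_test_docs(test_file: Path) -> dict[str, str]:
    """Map test function name -> first docstring line (the 'What' column)."""
    tree = ast.parse(test_file.read_text())
    docs = {}
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("test_"):
            doc = ast.get_docstring(node) or ""
            docs[node.name] = doc.splitlines()[0] if doc else ""
    return docs

# Example: collect docstrings for every component test file
for path in sorted(Path("core/tests/dummy_agents").glob("test_component_*.py")):
    print(path.name, extract_test_docs(path))
```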
Use these mappings for the component test files:
```
test_component_llm.py → "LLM Provider" — streaming, tool calling, tokens
test_component_tools.py → "Tool Registry + MCP" — connection, execution
test_component_event_loop.py → "EventLoopNode" — iteration, output, stall
test_component_edges.py → "Edge Evaluation" — conditional, priority
test_component_conversation.py → "Conversation Persistence" — storage, cursor
test_component_escalation.py → "Escalation Flow" — worker→queen signaling
test_component_continuous.py → "Continuous Mode" — conversation threading
test_component_queen.py → "Queen Phase (Unit)" — phase state, tools, events
test_component_queen_live.py → "Queen Phase (Live)" — real queen, real LLM
test_component_queen_state_machine.py → "Queen State Machine" — edge cases, races
test_component_worker_comms.py → "Worker Communication" — events, data flow
test_component_strict_outcomes.py → "Strict Outcomes" — exact path, output, quality
```
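To populate the What/Why columns without importing the test modules, the docstrings can be read statically. A sketch, assuming the docstring conventions above (first line = What, body = Why); the How column still comes from reading the test body:
```python
# Sketch: statically extract test docstrings for the What / Why columns.
import ast
from pathlib import Path

def docstring_summaries(test_file: str) -> dict:
    tree = ast.parse(Path(test_file).read_text())
    out = {}
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name.startswith("test_"):
            doc = ast.get_docstring(node) or ""
            lines = doc.splitlines()
            out[node.name] = {
                "what": lines[0].strip() if lines else "",
                "why": " ".join(l.strip() for l in lines[1:]).strip(),
            }
    return out
```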
#### HTML Template
Use this structure:
```html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hive Integration Test Report — {timestamp}</title>
<style>
:root { --pass: #22c55e; --fail: #ef4444; --skip: #f59e0b; --bg: #0f172a; --surface: #1e293b; --text: #e2e8f0; --muted: #94a3b8; --border: #334155; }
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: 'SF Mono', 'Fira Code', monospace; background: var(--bg); color: var(--text); padding: 2rem; line-height: 1.6; }
h1, h2, h3 { font-weight: 600; }
h1 { font-size: 1.5rem; margin-bottom: 1rem; }
h2 { font-size: 1.2rem; margin: 2rem 0 1rem; border-bottom: 1px solid var(--border); padding-bottom: 0.5rem; }
.summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; margin-bottom: 2rem; }
.card { background: var(--surface); padding: 1rem; border-radius: 8px; border: 1px solid var(--border); }
.card .label { color: var(--muted); font-size: 0.75rem; text-transform: uppercase; }
.card .value { font-size: 1.5rem; font-weight: 700; margin-top: 0.25rem; }
.card .value.pass { color: var(--pass); }
.card .value.fail { color: var(--fail); }
table { width: 100%; border-collapse: collapse; font-size: 0.8rem; }
th { background: var(--surface); position: sticky; top: 0; text-align: left; padding: 0.5rem; border-bottom: 2px solid var(--border); color: var(--muted); text-transform: uppercase; font-size: 0.7rem; }
td { padding: 0.5rem; border-bottom: 1px solid var(--border); vertical-align: top; }
tr:hover { background: rgba(255,255,255,0.03); }
.badge { display: inline-block; padding: 2px 8px; border-radius: 4px; font-size: 0.7rem; font-weight: 700; }
.badge.pass { background: rgba(34,197,94,0.2); color: var(--pass); }
.badge.fail { background: rgba(239,68,68,0.2); color: var(--fail); }
.badge.skip { background: rgba(245,158,11,0.2); color: var(--skip); }
.detail { background: #1a1a2e; padding: 0.75rem; border-radius: 4px; margin-top: 0.5rem; font-size: 0.75rem; white-space: pre-wrap; overflow-x: auto; max-height: 200px; overflow-y: auto; }
.component-header { background: var(--surface); padding: 0.75rem 0.5rem; font-weight: 600; font-size: 0.85rem; }
.meta { color: var(--muted); font-size: 0.75rem; }
</style>
</head>
<body>
<h1>Hive Integration Test Report</h1>
<p class="meta">Generated: {timestamp} | Provider: {provider} | Duration: {duration}s</p>
<div class="summary">
<div class="card"><div class="label">Total</div><div class="value">{total}</div></div>
<div class="card"><div class="label">Passed</div><div class="value pass">{passed}</div></div>
<div class="card"><div class="label">Failed</div><div class="value fail">{failed}</div></div>
<div class="card"><div class="label">Verdict</div><div class="value {verdict_class}">{verdict}</div></div>
</div>
<h2>Test Results</h2>
<table>
<thead>
<tr>
<th>Component</th>
<th>Test</th>
<th>Status</th>
<th>Time</th>
<th>What</th>
<th>Input → Expected → Actual</th>
</tr>
</thead>
<tbody>
<!-- For each test: -->
<tr>
<td>{component}</td>
<td>{test_name}</td>
<td><span class="badge {status_class}">{status}</span></td>
<td>{duration}s</td>
<td>{what_description}</td>
<td>
<strong>Input:</strong> {input_description}<br>
<strong>Expected:</strong> {expected_outcome}<br>
<strong>Actual:</strong> {actual_outcome}
<!-- If failed: -->
<div class="detail">{failure_traceback}</div>
</td>
</tr>
</tbody>
</table>
<h2>Failure Analysis</h2>
<!-- Only if there are failures -->
<p>For each failure, provide:</p>
<ul>
<li><strong>Root cause:</strong> Why it failed</li>
<li><strong>Impact:</strong> What this means for the system</li>
<li><strong>Suggested fix:</strong> How to address it</li>
</ul>
</body>
</html>
```
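As a reference, a minimal sketch for rendering the per-test rows of the table body, assuming the record structure from the JUnit parsing sketch in Step 2; the Input/Expected/Actual cell is left out here since it is filled from the test source:
```python
# Sketch: render the per-test <tbody> rows from parsed records.
from html import escape

def render_rows(records):
    rows = []
    for r in records:
        detail = (
            f'<div class="detail">{escape(r["detail"])}</div>' if r["status"] == "FAIL" else ""
        )
        rows.append(
            f'<tr><td>{escape(r["component"])}</td>'
            f'<td>{escape(r["test"])}</td>'
            f'<td><span class="badge {r["status"].lower()}">{r["status"]}</span></td>'
            f'<td>{r["duration"]:.2f}s</td>'
            f'<td>{escape(r.get("what", ""))}</td>'
            f'<td>{detail}</td></tr>'
        )
    return "\n".join(rows)
```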
### Step 4: Output
1. Write the HTML file to `/tmp/hive_integration_test_report.html`
2. Print the file path so the user can open it
3. Print a concise summary to the terminal:
```
Test Report: /tmp/hive_integration_test_report.html
Result: 74/76 PASSED (2 failures)
Failures:
- parallel_merge::test_parallel_disjoint_output_keys
- worker::test_worker_timestamped_note_artifact
```
## Key Rules
1. ALWAYS use `--junitxml` when running pytest to get structured results
2. ALWAYS read the test source files to populate What/How/Why columns — do not guess
3. For Input/Expected/Actual, extract from the test's graph spec, assertions, and result
4. Color-code everything: green for pass, red for fail, amber for skip
5. Include the full traceback for failures in a scrollable `<div class="detail">`
6. Group tests by component (file name) with a visual separator
7. The report must be self-contained HTML (no external CSS/JS dependencies)
-7
@@ -1,7 +0,0 @@
# Project-level Codex config for Hive.
# Keep this file minimal: MCP connectivity + skill discovery.
[mcp_servers.agent-builder]
command = "uv"
args = ["run", "--directory", "core", "-m", "framework.mcp.agent_builder_server"]
cwd = "."
-20
@@ -1,20 +0,0 @@
{
"mcpServers": {
"agent-builder": {
"command": "python",
"args": ["-m", "framework.mcp.agent_builder_server"],
"cwd": "core",
"env": {
"PYTHONPATH": "../tools/src"
}
},
"tools": {
"command": "python",
"args": ["mcp_server.py", "--stdio"],
"cwd": "tools",
"env": {
"PYTHONPATH": "src"
}
}
}
}
-1
@@ -1 +0,0 @@
../../.claude/skills/hive
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-concepts
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-create
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-credentials
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-patterns
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-test
@@ -0,0 +1,89 @@
name: Integration Bounty
description: A bounty task for the integration contribution program
title: "[Bounty]: "
labels: []
body:
- type: markdown
attributes:
value: |
## Integration Bounty
This issue is part of the [Integration Bounty Program](../../docs/bounty-program/README.md).
**Claim this bounty** by commenting below — a maintainer will assign you within 24 hours.
- type: dropdown
id: bounty-type
attributes:
label: Bounty Type
options:
- "Test a Tool (20 pts)"
- "Write Docs (20 pts)"
- "Code Contribution (30 pts)"
- "New Integration (75 pts)"
validations:
required: true
- type: dropdown
id: difficulty
attributes:
label: Difficulty
options:
- Easy
- Medium
- Hard
validations:
required: true
- type: input
id: tool-name
attributes:
label: Tool Name
description: The integration this bounty targets (e.g., `airtable`, `salesforce`)
placeholder: e.g., airtable
validations:
required: true
- type: textarea
id: description
attributes:
label: Description
description: What needs to be done to complete this bounty.
placeholder: |
Describe the specific task, including:
- What the contributor needs to do
- Links to relevant files in the repo
- Any setup requirements (API keys, accounts, etc.)
validations:
required: true
- type: textarea
id: acceptance-criteria
attributes:
label: Acceptance Criteria
description: What "done" looks like. The PR or report must meet all criteria.
placeholder: |
- [ ] Criterion 1
- [ ] Criterion 2
- [ ] CI passes
validations:
required: true
- type: textarea
id: relevant-files
attributes:
label: Relevant Files
description: Links to tool directory, credential spec, health check file, etc.
placeholder: |
- Tool: `tools/src/aden_tools/tools/{tool_name}/`
- Credential spec: `tools/src/aden_tools/credentials/{category}.py`
- Health checks: `tools/src/aden_tools/credentials/health_check.py`
- type: textarea
id: resources
attributes:
label: Resources
description: Links to API docs, examples, or guides that will help the contributor.
placeholder: |
- [Building Tools Guide](../../tools/BUILDING_TOOLS.md)
- [Tool README Template](../../docs/bounty-program/templates/tool-readme-template.md)
- API docs: https://...
@@ -0,0 +1,78 @@
name: Standard Bounty
description: A bounty task for general framework contributions (not integration-specific)
title: "[Bounty]: "
labels: []
body:
- type: markdown
attributes:
value: |
## Standard Bounty
This issue is part of the [Bounty Program](../../docs/bounty-program/README.md).
**Claim this bounty** by commenting below — a maintainer will assign you within 24 hours.
- type: dropdown
id: bounty-size
attributes:
label: Bounty Size
options:
- "Small (10 pts)"
- "Medium (30 pts)"
- "Large (75 pts)"
- "Extreme (150 pts)"
validations:
required: true
- type: dropdown
id: difficulty
attributes:
label: Difficulty
options:
- Easy
- Medium
- Hard
validations:
required: true
- type: textarea
id: description
attributes:
label: Description
description: What needs to be done to complete this bounty.
placeholder: |
Describe the specific task, including:
- What the contributor needs to do
- Links to relevant files in the repo
- Any context or motivation for the change
validations:
required: true
- type: textarea
id: acceptance-criteria
attributes:
label: Acceptance Criteria
description: What "done" looks like. The PR must meet all criteria.
placeholder: |
- [ ] Criterion 1
- [ ] Criterion 2
- [ ] CI passes
validations:
required: true
- type: textarea
id: relevant-files
attributes:
label: Relevant Files
description: Links to files or directories related to this bounty.
placeholder: |
- `path/to/file.py`
- `path/to/directory/`
- type: textarea
id: resources
attributes:
label: Resources
description: Links to docs, issues, or external references that will help.
placeholder: |
- Related issue: #XXXX
- Docs: https://...
+47
@@ -0,0 +1,47 @@
name: Bounty completed
description: Awards points and notifies Discord when a bounty PR is merged
on:
pull_request_target:
types: [closed]
workflow_dispatch:
inputs:
pr_number:
description: "PR number to process (for missed bounties)"
required: true
type: number
jobs:
bounty-notify:
if: >
github.event_name == 'workflow_dispatch' ||
(github.event.pull_request.merged == true &&
contains(join(github.event.pull_request.labels.*.name, ','), 'bounty:'))
runs-on: ubuntu-latest
timeout-minutes: 5
permissions:
contents: read
pull-requests: read
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Setup Bun
uses: oven-sh/setup-bun@v2
with:
bun-version: latest
- name: Award XP and notify Discord
run: bun run scripts/bounty-tracker.ts notify
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }}
DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_BOUNTY_WEBHOOK_URL }}
BOT_API_URL: ${{ secrets.BOT_API_URL }}
BOT_API_KEY: ${{ secrets.BOT_API_KEY }}
LURKR_API_KEY: ${{ secrets.LURKR_API_KEY }}
LURKR_GUILD_ID: ${{ secrets.LURKR_GUILD_ID }}
PR_NUMBER: ${{ inputs.pr_number || github.event.pull_request.number }}
+14 -6
@@ -5,7 +5,7 @@ on:
branches: [main]
pull_request:
branches: [main]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
@@ -24,6 +24,8 @@ jobs:
- name: Install uv
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
- name: Install dependencies
run: uv sync --project core --group dev
@@ -54,12 +56,14 @@ jobs:
- name: Install uv
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
- name: Install dependencies and run tests
working-directory: core
run: |
cd core
uv sync
uv run pytest tests/ -v
uv run pytest tests/ -v --ignore=tests/dummy_agents
test-tools:
name: Test Tools (${{ matrix.os }})
@@ -77,10 +81,12 @@ jobs:
- name: Install uv
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
- name: Install dependencies and run tests
working-directory: tools
run: |
cd tools
uv sync --extra dev
uv run pytest tests/ -v
@@ -98,10 +104,12 @@ jobs:
- name: Install uv
uses: astral-sh/setup-uv@v4
with:
enable-cache: true
- name: Install dependencies
working-directory: core
run: |
cd core
uv sync
- name: Validate exported agents
@@ -0,0 +1,54 @@
# Closes PRs that still have the `pr-requirements-warning` label
# after contributors were warned in pr-requirements.yml.
name: PR Requirements Enforcement
on:
schedule:
- cron: "0 0 * * *" # runs every day once at midnight
jobs:
enforce:
name: Close PRs still failing contribution requirements
runs-on: ubuntu-latest
permissions:
pull-requests: write
issues: write
steps:
- name: Close PRs still failing requirements
uses: actions/github-script@v7
with:
script: |
const { owner, repo } = context.repo;
const prs = await github.paginate(github.rest.pulls.list, {
owner,
repo,
state: "open",
per_page: 100
});
for (const pr of prs) {
// Skip draft PRs — author may still be actively working toward compliance
if (pr.draft) continue;
const labels = pr.labels.map(l => l.name);
if (!labels.includes("pr-requirements-warning")) continue;
const gracePeriod = 24 * 60 * 60 * 1000;
const lastUpdated = new Date(pr.created_at);
const now = new Date();
if (now - lastUpdated < gracePeriod) {
console.log(`Skipping PR #${pr.number} — still within grace period`);
continue;
}
const prNumber = pr.number;
const prAuthor = pr.user.login;
await github.rest.issues.createComment({
owner,
repo,
issue_number: prNumber,
body: `Closing PR because the contribution requirements were not resolved within the 24-hour grace period.
If this was closed in error, feel free to reopen the PR after fixing the requirements.`
});
await github.rest.pulls.update({
owner,
repo,
pull_number: prNumber,
state: "closed"
});
console.log(`Closed PR #${prNumber} by ${prAuthor} (PR requirements were not met)`);
}
+31 -17
@@ -43,9 +43,10 @@ jobs:
console.log(` Found issue references: ${issueNumbers.length > 0 ? issueNumbers.join(', ') : 'none'}`);
if (issueNumbers.length === 0) {
const message = `## PR Closed - Requirements Not Met
const message = `## PR Requirements Warning
This PR has been automatically closed because it doesn't meet the requirements.
This PR does not meet the contribution requirements.
If the issue is not fixed within ~24 hours, it may be automatically closed.
**Missing:** No linked issue found.
@@ -67,14 +68,15 @@ jobs:
**Why is this required?** See #472 for details.`;
const comments = await github.rest.issues.listComments({
const comments = await github.paginate(github.rest.issues.listComments, {
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
per_page: 100,
});
const botComment = comments.data.find(
(c) => c.user.type === 'Bot' && c.body.includes('PR Closed - Requirements Not Met')
const botComment = comments.find(
(c) => c.user.type === 'Bot' && c.body.includes('PR Requirements Warning')
);
if (!botComment) {
@@ -86,11 +88,11 @@ jobs:
});
}
await github.rest.pulls.update({
await github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: prNumber,
state: 'closed',
issue_number: prNumber,
labels: ['pr-requirements-warning'],
});
core.setFailed('PR must reference an issue');
@@ -132,9 +134,10 @@ jobs:
`#${i.number} (assignees: ${i.assignees.length > 0 ? i.assignees.join(', ') : 'none'})`
).join(', ');
const message = `## PR Closed - Requirements Not Met
const message = `## PR Requirements Warning
This PR has been automatically closed because it doesn't meet the requirements.
This PR does not meet the contribution requirements.
If the issue is not fixed within ~24 hours, it may be automatically closed.
**PR Author:** @${prAuthor}
**Found issues:** ${issueList}
@@ -157,14 +160,15 @@ jobs:
**Why is this required?** See #472 for details.`;
const comments = await github.rest.issues.listComments({
const comments = await github.paginate(github.rest.issues.listComments, {
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
per_page: 100,
});
const botComment = comments.data.find(
(c) => c.user.type === 'Bot' && c.body.includes('PR Closed - Requirements Not Met')
const botComment = comments.find(
(c) => c.user.type === 'Bot' && c.body.includes('PR Requirements Warning')
);
if (!botComment) {
@@ -176,14 +180,24 @@ jobs:
});
}
await github.rest.pulls.update({
await github.rest.issues.addLabels({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: prNumber,
state: 'closed',
issue_number: prNumber,
labels: ['pr-requirements-warning'],
});
core.setFailed('PR author must be assigned to the linked issue');
} else {
console.log(`PR requirements met! Issue #${issueWithAuthorAssigned} has ${prAuthor} as assignee.`);
}
try {
await github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
name: "pr-requirements-warning"
});
} catch (error) {
// ignore if the label doesn't exist
}
}
+42
@@ -0,0 +1,42 @@
name: Weekly bounty leaderboard
description: Posts the integration bounty leaderboard to Discord every Monday
on:
schedule:
# Every Monday at 9:00 UTC
- cron: "0 9 * * 1"
workflow_dispatch:
inputs:
since_date:
description: "Only count PRs merged after this date (YYYY-MM-DD). Leave empty for all-time."
required: false
jobs:
leaderboard:
runs-on: ubuntu-latest
timeout-minutes: 5
permissions:
contents: read
pull-requests: read
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Setup Bun
uses: oven-sh/setup-bun@v2
with:
bun-version: latest
- name: Post leaderboard to Discord
run: bun run scripts/bounty-tracker.ts leaderboard
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
GITHUB_REPOSITORY_NAME: ${{ github.event.repository.name }}
DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_BOUNTY_WEBHOOK_URL }}
BOT_API_URL: ${{ secrets.BOT_API_URL }}
BOT_API_KEY: ${{ secrets.BOT_API_KEY }}
LURKR_API_KEY: ${{ secrets.LURKR_API_KEY }}
LURKR_GUILD_ID: ${{ secrets.LURKR_GUILD_ID }}
SINCE_DATE: ${{ github.event.inputs.since_date || '' }}
+7 -5
@@ -13,6 +13,10 @@ out/
.env
.env.local
.env.*.local
.venv
/venv
tools/src/uv.lock
# User configuration (copied from .example)
config.yaml
@@ -66,13 +70,10 @@ tmp/
temp/
exports/*
.agent-builder-sessions/*
exports.old*
artifacts/*
.claude/settings.local.json
.claude/skills/ship-it/
.venv
docs/github-issues/*
core/tests/*dumps/*
@@ -80,3 +81,4 @@ core/tests/*dumps/*
screenshots/*
.gemini/*
.coverage
@@ -0,0 +1,9 @@
{"type": "connection", "event": "connect", "ts": "2026-04-04T01:10:38.245667+00:00", "profile": "default"}
{"type": "connection", "event": "hello", "details": {"version": "1.0"}, "ts": "2026-04-04T01:10:38.247207+00:00", "profile": "default"}
{"type": "connection", "event": "disconnect", "ts": "2026-04-04T01:11:57.148273+00:00", "profile": "default"}
{"type": "connection", "event": "connect", "ts": "2026-04-04T01:12:09.162378+00:00", "profile": "default"}
{"type": "connection", "event": "hello", "details": {"version": "1.0"}, "ts": "2026-04-04T01:12:09.163899+00:00", "profile": "default"}
{"type": "connection", "event": "disconnect", "ts": "2026-04-04T01:15:12.826042+00:00", "profile": "default"}
{"type": "connection", "event": "connect", "ts": "2026-04-04T01:15:30.842533+00:00", "profile": "default"}
{"type": "connection", "event": "hello", "details": {"version": "1.0"}, "ts": "2026-04-04T01:15:30.845025+00:00", "profile": "default"}
{"type": "tool_call", "tool": "browser_stop", "params": {"profile": "gcu-browser-worker:3"}, "result": {"ok": true, "status": "not_running", "profile": "gcu-browser-worker:3"}, "ok": true, "duration_ms": 0.01, "ts": "2026-04-04T01:29:04.294954+00:00", "profile": "default"}
+1 -7
@@ -1,9 +1,3 @@
{
"mcpServers": {
"agent-builder": {
"command": "uv",
"args": ["run", "-m", "framework.mcp.agent_builder_server"],
"cwd": "core"
}
}
"mcpServers": {}
}
-30
@@ -1,30 +0,0 @@
{
"mcpServers": {
"agent-builder": {
"command": "uv",
"args": [
"run",
"python",
"-m",
"framework.mcp.agent_builder_server"
],
"cwd": "core",
"env": {
"PYTHONPATH": "../tools/src"
}
},
"tools": {
"command": "uv",
"args": [
"run",
"python",
"mcp_server.py",
"--stdio"
],
"cwd": "tools",
"env": {
"PYTHONPATH": "src"
}
}
}
}
-1
@@ -1 +0,0 @@
../../.claude/skills/hive
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-concepts
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-create
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-credentials
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-debugger
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-patterns
-1
@@ -1 +0,0 @@
../../.claude/skills/hive-test
-1
@@ -1 +0,0 @@
../../.claude/skills/triage-issue
-7
@@ -1,7 +0,0 @@
{
"recommendations": [
"charliermarsh.ruff",
"editorconfig.editorconfig",
"ms-python.python"
]
}
-4
@@ -2,10 +2,6 @@
Shared agent instructions for this workspace.
## Deprecations
- **TUI is deprecated.** The terminal UI (`hive tui`) is no longer maintained. Use the browser-based interface (`hive open`) instead.
## Coding Agent Notes
-
+150 -27
@@ -1,17 +1,149 @@
# Release Notes
## v0.7.1
**Release Date:** March 13, 2026
**Tag:** v0.7.1
### Chrome-Native Browser Control
v0.7.1 replaces Playwright with direct Chrome DevTools Protocol (CDP) integration. The GCU now launches the user's system Chrome via `open -n` on macOS, connects over CDP, and manages browser lifecycle end-to-end -- no extra browser binary required.
---
### Highlights
#### System Chrome via CDP
The entire GCU browser stack has been rewritten:
- **Chrome finder & launcher** -- New `chrome_finder.py` discovers installed Chrome and `chrome_launcher.py` manages process lifecycle with `--remote-debugging-port`
- **Coexist with user's browser** -- `open -n` on macOS launches a separate Chrome instance so the user's tabs stay untouched
- **Dynamic viewport sizing** -- Viewport auto-sizes to the available display area, suppressing Chrome warning bars
- **Orphan cleanup** -- Chrome processes are killed on GCU server shutdown to prevent leaks
- **`--no-startup-window`** -- Chrome launches headlessly by default until a page is needed
#### Per-Subagent Browser Isolation
Each GCU subagent gets its own Chrome user-data directory, preventing cookie/session cross-contamination:
- Unique browser profiles injected per subagent
- Profiles cleaned up after top-level GCU node execution
- Tab origin and age metadata tracked per subagent
#### Dummy Agent Testing Framework
A comprehensive test suite for validating agent graph patterns without LLM calls:
- 8 test modules covering echo, pipeline, branch, parallel merge, retry, feedback loop, worker, and GCU subagent patterns
- Shared fixtures and a `run_all.py` runner for CI integration
- Subagent lifecycle tests
---
### What's New
#### GCU Browser
- **Switch from Playwright to system Chrome via CDP** -- Direct CDP connection replaces Playwright dependency. (@bryanadenhq)
- **Chrome finder and launcher modules** -- `chrome_finder.py` and `chrome_launcher.py` for cross-platform Chrome discovery and process management. (@bryanadenhq)
- **Dynamic viewport sizing** -- Auto-size viewport and suppress Chrome warning bar. (@bryanadenhq)
- **Per-subagent browser profile isolation** -- Unique user-data directories per subagent with cleanup. (@bryanadenhq)
- **Tab origin/age metadata** -- Track which subagent opened each tab and when. (@bryanadenhq)
- **`browser_close_all` tool** -- Bulk tab cleanup for agents managing many pages. (@bryanadenhq)
- **Auto-track popup pages** -- Popups are automatically captured and tracked. (@bryanadenhq)
- **Auto-snapshot from browser interactions** -- Browser interaction tools return screenshots automatically. (@bryanadenhq)
- **Kill orphaned Chrome processes** -- GCU server shutdown cleans up lingering Chrome instances. (@bryanadenhq)
- **`--no-startup-window` Chrome flag** -- Prevent empty window on launch. (@bryanadenhq)
- **Launch Chrome via `open -n` on macOS** -- Coexist with the user's running browser. (@bryanadenhq)
#### Framework & Runtime
- **Session resume fix for new agents** -- Correctly resume sessions when a new agent is loaded. (@bryanadenhq)
- **Queen upsert fix** -- Prevent duplicate queen entries on session restore. (@bryanadenhq)
- **Anchor worker monitoring to queen's session ID on cold-restore** -- Worker monitors reconnect to the correct queen after restart. (@bryanadenhq)
- **Update meta.json when loading workers** -- Worker metadata stays in sync with runtime state. (@RichardTang-Aden)
- **Generate worker MCP file correctly** -- Fix MCP config generation for spawned workers. (@RichardTang-Aden)
- **Share event bus so tool events are visible to parent** -- Tool execution events propagate up to parent graphs. (@bryanadenhq)
- **Subagent activity tracking in queen status** -- Queen instructions include live subagent status. (@bryanadenhq)
- **GCU system prompt updates** -- Auto-snapshots, batching, popup tracking, and close_all guidance. (@bryanadenhq)
#### Frontend
- **Loading spinner in draft panel** -- Shows spinner during planning phase instead of blank panel. (@bryanadenhq)
- **Fix credential modal errors** -- Modal no longer eats errors; banner stays visible. (@bryanadenhq)
- **Fix credentials_required loop** -- Stop clearing the flag on modal close to prevent infinite re-prompting. (@bryanadenhq)
- **Fix "Add tab" dropdown overflow** -- Dropdown no longer hidden when many agents are open. (@prasoonmhwr)
#### Testing
- **Dummy agent test framework** -- 8 test modules (echo, pipeline, branch, parallel merge, retry, feedback loop, worker, GCU subagent) with shared fixtures and CI runner. (@bryanadenhq)
- **Subagent lifecycle tests** -- Validate subagent spawn and completion flows. (@bryanadenhq)
#### Documentation & Infrastructure
- **MCP integration PRD** -- Product requirements for MCP server registry. (@TimothyZhang7)
- **Skills registry PRD** -- Product requirements for skill registry system. (@bryanadenhq)
- **Bounty program updates** -- Standard bounty issue template and updated contributor guide. (@bryanadenhq)
- **Windows quickstart** -- Add default context limit for PowerShell setup. (@bryanadenhq)
- **Remove deprecated files** -- Clean up `setup_mcp.py`, `verify_mcp.py`, `antigravity-setup.md`, and `setup-antigravity-mcp.sh`. (@bryanadenhq)
---
### Bug Fixes
- Fix credential modal eating errors and banner staying open
- Stop clearing `credentials_required` on modal close to prevent infinite loop
- Share event bus so tool events are visible to parent graph
- Use lazy %-formatting in subagent completion log to avoid f-string in logger
- Anchor worker monitoring to queen's session ID on cold-restore
- Update meta.json when loading workers
- Generate worker MCP file correctly
- Fix "Add tab" dropdown partially hidden when creating multiple agents
---
### Community Contributors
- **Prasoon Mahawar** (@prasoonmhwr) -- Fix UI overflow on agent tab dropdown
- **Richard Tang** (@RichardTang-Aden) -- Worker MCP generation and meta.json fixes
---
### Upgrading
```bash
git pull origin main
uv sync
```
The Playwright dependency is no longer required for GCU browser operations. Chrome must be installed on the host system.
---
## v0.7.0
**Release Date:** March 5, 2026
**Tag:** v0.7.0
Session management refactor release.
---
## v0.5.1
**Release Date:** February 18, 2026
**Tag:** v0.5.1
## The Hive Gets a Brain
### The Hive Gets a Brain
v0.5.1 is our most ambitious release yet. Hive agents can now **build other agents** -- the new Hive Coder meta-agent writes, tests, and fixes agent packages from natural language. The runtime grows multi-graph support so one session can orchestrate multiple agents simultaneously. The TUI gets a complete overhaul with an in-app agent picker, live streaming, and seamless escalation to the Coder. And we're now provider-agnostic: Claude Code subscriptions, OpenAI-compatible endpoints, and any LiteLLM-supported model work out of the box.
---
## Highlights
### Highlights
### Hive Coder -- The Agent That Builds Agents
#### Hive Coder -- The Agent That Builds Agents
A native meta-agent that lives inside the framework at `core/framework/agents/hive_coder/`. Give it a natural-language specification and it produces a complete agent package -- goal definition, node prompts, edge routing, MCP tool wiring, tests, and all boilerplate files.
@@ -30,7 +162,7 @@ The Coder ships with:
- **Coder Tools MCP server** -- file I/O, fuzzy-match editing, git snapshots, and sandboxed shell execution (`tools/coder_tools_server.py`)
- **Test generation** -- structural tests for forever-alive agents that don't hang on `runner.run()`
### Multi-Graph Agent Runtime
#### Multi-Graph Agent Runtime
`AgentRuntime` now supports loading, managing, and switching between multiple agent graphs within a single session. Six new lifecycle tools give agents (and the TUI) full control:
@@ -44,7 +176,7 @@ await runtime.add_graph("exports/deep_research_agent")
The Hive Coder uses multi-graph internally -- when you escalate from a worker agent, the Coder loads as a separate graph while the worker stays alive in the background.
### TUI Revamp
#### TUI Revamp
The Terminal UI gets a ground-up rebuild with five major additions:
@@ -54,7 +186,7 @@ The Terminal UI gets a ground-up rebuild with five major additions:
- **PDF attachments** -- `/attach` and `/detach` commands with native OS file dialog (macOS, Linux, Windows)
- **Multi-graph commands** -- `/graphs`, `/graph <id>`, `/load <path>`, `/unload <id>` for managing agent graphs in-session
### Provider-Agnostic LLM Support
#### Provider-Agnostic LLM Support
Hive is no longer Anthropic-only. v0.5.1 adds first-class support for:
@@ -66,9 +198,9 @@ The quickstart script auto-detects Claude Code subscriptions and ZAI Code instal
---
## What's New
### What's New
### Architecture & Runtime
#### Architecture & Runtime
- **Hive Coder meta-agent** -- Natural-language agent builder with reference docs, guardian watchdog, and `hive code` CLI command. (@TimothyZhang7)
- **Multi-graph agent sessions** -- `add_graph`/`remove_graph` on AgentRuntime with 6 lifecycle tools (`load_agent`, `unload_agent`, `start_agent`, `restart_agent`, `list_agents`, `get_user_presence`). (@TimothyZhang7)
@@ -79,7 +211,7 @@ The quickstart script auto-detects Claude Code subscriptions and ZAI Code instal
- **Pre-start confirmation prompt** -- Interactive prompt before agent execution allowing credential updates or abort. (@RichardTang-Aden)
- **Event bus multi-graph support** -- `graph_id` on events, `filter_graph` on subscriptions, `ESCALATION_REQUESTED` event type, `exclude_own_graph` filter. (@TimothyZhang7)
### TUI Improvements
#### TUI Improvements
- **In-app agent picker** (Ctrl+A) -- Tabbed modal for browsing agents with metadata badges (nodes, tools, sessions, tags). (@TimothyZhang7)
- **Runtime-optional TUI startup** -- Launches without a pre-loaded agent, shows agent picker on startup. (@TimothyZhang7)
@@ -89,7 +221,7 @@ The quickstart script auto-detects Claude Code subscriptions and ZAI Code instal
- **Multi-graph TUI commands** -- `/graphs`, `/graph <id>`, `/load <path>`, `/unload <id>`. (@TimothyZhang7)
- **Agent Guardian watchdog** -- Event-driven monitor that catches secondary agent failures and triggers automatic remediation, with `--no-guardian` CLI flag. (@TimothyZhang7)
### New Tool Integrations
#### New Tool Integrations
| Tool | Description | Contributor |
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
@@ -99,7 +231,7 @@ The quickstart script auto-detects Claude Code subscriptions and ZAI Code instal
| **Google Docs** | Document creation, reading, and editing with OAuth credential support | @haliaeetusvocifer |
| **Gmail enhancements** | Expanded mail operations for inbox management | @bryanadenhq |
### Infrastructure
#### Infrastructure
- **Default node type → `event_loop`** -- `NodeSpec.node_type` defaults to `"event_loop"` instead of `"llm_tool_use"`. (@TimothyZhang7)
- **Default `max_node_visits` → 0 (unlimited)** -- Nodes default to unlimited visits, reducing friction for feedback loops and forever-alive agents. (@TimothyZhang7)
@@ -112,7 +244,7 @@ The quickstart script auto-detects Claude Code subscriptions and ZAI Code instal
---
## Bug Fixes
### Bug Fixes
- Flush WIP accumulator outputs on cancel/failure so edge conditions see correct values on resume
- Stall detection state preserved across resume (no more resets on checkpoint restore)
@@ -125,13 +257,13 @@ The quickstart script auto-detects Claude Code subscriptions and ZAI Code instal
- Fix email agent version conflicts (@RichardTang-Aden)
- Fix coder tool timeouts (120s for tests, 300s cap for commands)
## Documentation
### Documentation
- Clarify installation and prevent root pip install misuse (@paarths-collab)
---
## Agent Updates
### Agent Updates
- **Email Inbox Management** -- Consolidate `gmail_inbox_guardian` and `inbox_management` into a single unified agent with updated prompts and config. (@RichardTang-Aden, @bryanadenhq)
- **Job Hunter** -- Updated node prompts, config, and agent metadata; added PDF resume selection. (@bryanadenhq)
@@ -141,7 +273,7 @@ The quickstart script auto-detects Claude Code subscriptions and ZAI Code instal
---
## Breaking Changes
### Breaking Changes
- **Deprecated node types raise `RuntimeError`** -- `llm_tool_use`, `llm_generate`, `function`, `router`, `human_input` now fail instead of warning. Migrate to `event_loop`.
- **`NodeSpec.node_type` defaults to `"event_loop"`** (was `"llm_tool_use"`)
@@ -150,7 +282,7 @@ The quickstart script auto-detects Claude Code subscriptions and ZAI Code instal
---
## Community Contributors
### Community Contributors
A huge thank you to everyone who contributed to this release:
@@ -165,14 +297,14 @@ A huge thank you to everyone who contributed to this release:
---
## Upgrading
### Upgrading
```bash
git pull origin main
uv sync
```
### Migration Guide
#### Migration Guide
If your agents use deprecated node types, update them:
@@ -196,12 +328,3 @@ hive code
# Or from TUI -- press Ctrl+E to escalate
hive tui
```
---
## What's Next
- **Agent-to-agent communication** -- one agent's output triggers another agent's entry point
- **Cost visibility** -- detailed runtime log of LLM costs per node and per session
- **Persistent webhook subscriptions** -- survive agent restarts without re-registering
- **Remote agent deployment** -- run agents as long-lived services with HTTP APIs
+1043 -18
File diff suppressed because it is too large
+21 -14
@@ -1,27 +1,34 @@
.PHONY: lint format check test install-hooks help frontend-install frontend-dev frontend-build
.PHONY: lint format check test test-tools test-live test-all install-hooks help frontend-install frontend-dev frontend-build
# ── Ensure uv is findable in Git Bash on Windows ──────────────────────────────
# uv installs to ~/.local/bin on Windows/Linux/macOS. Git Bash may not include
# this in PATH by default, so we prepend it here.
export PATH := $(HOME)/.local/bin:$(PATH)
# ── Targets ───────────────────────────────────────────────────────────────────
help: ## Show this help
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}'
lint: ## Run ruff linter and formatter (with auto-fix)
cd core && ruff check --fix .
cd tools && ruff check --fix .
cd core && ruff format .
cd tools && ruff format .
cd core && uv run ruff check --fix .
cd tools && uv run ruff check --fix .
cd core && uv run ruff format .
cd tools && uv run ruff format .
format: ## Run ruff formatter
cd core && ruff format .
cd tools && ruff format .
cd core && uv run ruff format .
cd tools && uv run ruff format .
check: ## Run all checks without modifying files (CI-safe)
cd core && ruff check .
cd tools && ruff check .
cd core && ruff format --check .
cd tools && ruff format --check .
cd core && uv run ruff check .
cd tools && uv run ruff check .
cd core && uv run ruff format --check .
cd tools && uv run ruff format --check .
test: ## Run all tests (core + tools, excludes live)
cd core && uv run python -m pytest tests/ -v
cd core && uv run python -m pytest tests/ -v --ignore=tests/dummy_agents
cd tools && uv run python -m pytest -v
test-tools: ## Run tool tests only (mocked, no credentials needed)
@@ -31,7 +38,7 @@ test-live: ## Run live integration tests (requires real API credentials)
cd tools && uv run python -m pytest -m live -s -o "addopts=" --log-cli-level=INFO
test-all: ## Run everything including live tests
cd core && uv run python -m pytest tests/ -v
cd core && uv run python -m pytest tests/ -v --ignore=tests/dummy_agents
cd tools && uv run python -m pytest -v
cd tools && uv run python -m pytest -m live -s -o "addopts=" --log-cli-level=INFO
@@ -46,4 +53,4 @@ frontend-dev: ## Start frontend dev server
cd core/frontend && npm run dev
frontend-build: ## Build frontend for production
cd core/frontend && npm run build
cd core/frontend && npm run build
+55 -182
@@ -1,5 +1,5 @@
<p align="center">
<img width="100%" alt="Hive Banner" src="https://github.com/user-attachments/assets/a027429b-5d3c-4d34-88e4-0feaeaabbab3" />
<img width="100%" alt="Hive Banner" src="https://asset.acho.io/github/img/banner.gif" />
</p>
<p align="center">
@@ -23,11 +23,12 @@
</p>
<p align="center">
<img src="https://img.shields.io/badge/Agent_Harness-Runtime_Layer-ff6600?style=flat-square" alt="Agent Harness" />
<img src="https://img.shields.io/badge/AI_Agents-Self--Improving-brightgreen?style=flat-square" alt="AI Agents" />
<img src="https://img.shields.io/badge/Multi--Agent-Systems-blue?style=flat-square" alt="Multi-Agent" />
<img src="https://img.shields.io/badge/Headless-Development-purple?style=flat-square" alt="Headless" />
<img src="https://img.shields.io/badge/Human--in--the--Loop-orange?style=flat-square" alt="HITL" />
<img src="https://img.shields.io/badge/Production--Ready-red?style=flat-square" alt="Production" />
<img src="https://img.shields.io/badge/Browser-Use-red?style=flat-square" alt="Browser Use" />
</p>
<p align="center">
<img src="https://img.shields.io/badge/OpenAI-supported-412991?style=flat-square&logo=openai" alt="OpenAI" />
@@ -35,37 +36,51 @@
<img src="https://img.shields.io/badge/Google_Gemini-supported-4285F4?style=flat-square&logo=google" alt="Gemini" />
</p>
<p align="center"><em>The agent harness for production workloads — state management, failure recovery, observability, and human oversight so your agents actually run.</em></p>
## Overview
Build autonomous, reliable, self-improving AI agents without hardcoding workflows. Define your goal through conversation with hive coding agent(queen), and the framework generates a node graph with dynamically created connection code. When things break, the framework captures failure data, evolves the agent through the coding agent, and redeploys. Built-in human-in-the-loop nodes, credential management, and real-time monitoring give you control without sacrificing adaptability.
OpenHive is a zero-setup, model-agnostic execution harness that dynamically generates multi-agent topologies to tackle complex, long-running business workflows without requiring any orchestration boilerplate. By simply defining your objective, the runtime compiles a strict, graph-based execution DAG that safely coordinates specialized agents to execute concurrent tasks in parallel. Backed by persistent, role-based memory that intelligently evolves with your project's context, OpenHive ensures deterministic fault tolerance, deep state observability, and seamless asynchronous execution across whichever underlying LLMs you choose to plug in.
## Features
- ✅ Multi-Agent Coordination for parallel task execution
- ✅ Graph-based execution for recurring and complex processes
- ✅ Role-based memory that evolves with your projects
- ✅ Zero Setup - No technical configuration required
- ✅ General Compute Use and Browser Use with Native Extension
- ✅ Custom Model Support
Visit [adenhq.com](https://adenhq.com) for complete documentation, examples, and guides.
[![Hive Demo](https://img.youtube.com/vi/XDOG9fOaLjU/maxresdefault.jpg)](https://www.youtube.com/watch?v=XDOG9fOaLjU)
Visit [HoneyComb](http://honeycomb.open-hive.com/) to see what jobs are being automated by AI. It's a stock market for jobs, driven by our community's AI agent progress. You can long and short jobs (with compute tokens, not real money) based on how much you think a job is going to be replaced by AI.
https://github.com/user-attachments/assets/bf10edc3-06ba-48b6-98ba-d069b15fb69d
## Who Is Hive For?
Hive is designed for developers and teams who want to build **production-grade AI agents** without manually wiring complex workflows.
Hive is the multi-agent harness layer for teams moving AI agents from prototype to production. Single agents like Openclaw and Cowork can handle personal jobs well but lack the rigor to run business processes.
Hive is a good fit if you:
- Want AI agents that **execute real business processes**, not demos
- Need **fast or high volume agent execution** over open workflow
- Need a **runtime that handles state, recovery, and parallel execution** at scale
- Need **self-healing and adaptive agents** that improve over time
- Require **human-in-the-loop control**, observability, and cost limits
- Plan to run agents in **production environments**
- Plan to run agents in **production** where uptime, cost, and auditability matter
Hive may not be the best fit if you're only experimenting with simple agent chains or one-off scripts.
## When Should You Use Hive?
Use Hive when you need:
Use Hive when the bottleneck is no longer the model but the harness around it:
- Long-running, autonomous agents
- Strong guardrails, process, and controls
- Continuous improvement based on failures
- Multi-agent coordination
- A framework that evolves with your goals
- Long-running agents that need **state persistence and crash recovery**
- Production workloads requiring **cost enforcement, observability, and audit trails**
- Agents that **self-heal** through failure capture and graph evolution
- Multi-agent coordination with **session isolation and shared buffers**
- A framework that **scales with model improvements** rather than fighting them
## Quick Links
@@ -73,7 +88,7 @@ Use Hive when you need:
- **[Self-Hosting Guide](https://docs.adenhq.com/getting-started/quickstart)** - Deploy Hive on your infrastructure
- **[Changelog](https://github.com/aden-hive/hive/releases)** - Latest updates and releases
- **[Roadmap](docs/roadmap.md)** - Upcoming features and plans
- **[Report Issues](https://github.com/adenhq/hive/issues)** - Bug reports and feature requests
- **[Report Issues](https://github.com/aden-hive/hive/issues)** - Bug reports and feature requests
- **[Contributing](CONTRIBUTING.md)** - How to contribute and submit PRs
## Quick Start
@@ -84,7 +99,7 @@ Use Hive when you need:
- An LLM provider that powers the agents
- **ripgrep (optional, recommended on Windows):** The `search_files` tool uses ripgrep for faster file search. If not installed, a Python fallback is used. On Windows: `winget install BurntSushi.ripgrep` or `scoop install ripgrep`
> **Note for Windows Users:** It is strongly recommended to use **WSL (Windows Subsystem for Linux)** or **Git Bash** to run this framework. Some core automation scripts may not execute correctly in standard Command Prompt or PowerShell.
> **Windows Users:** Native Windows is supported via `quickstart.ps1` and `hive.ps1`. Run these in PowerShell 5.1+. WSL is also an option but not required.
### Installation
@@ -98,9 +113,11 @@ Use Hive when you need:
git clone https://github.com/aden-hive/hive.git
cd hive
# Run quickstart setup
# Run quickstart setup (macOS/Linux)
./quickstart.sh
# Windows (PowerShell)
.\quickstart.ps1
```
This sets up:
@@ -108,54 +125,40 @@ This sets up:
- **framework** - Core agent runtime and graph executor (in `core/.venv`)
- **aden_tools** - MCP tools for agent capabilities (in `tools/.venv`)
- **credential store** - Encrypted API key storage (`~/.hive/credentials`)
- **LLM provider** - Interactive default model configuration
- **LLM provider** - Interactive default model configuration, including Hive LLM and OpenRouter
- All required Python dependencies with `uv`
- At last, it will initiate the open hive interface in your browser
- Finally, it will open the Hive interface in your browser
> **Tip:** To reopen the dashboard later, run `hive open` from the project directory.
<img width="2500" height="1214" alt="home-screen" src="https://github.com/user-attachments/assets/134d897f-5e75-4874-b00b-e0505f6b45c4" />
### Build Your First Agent
Type the agent you want to build in the home input box
Type the agent you want to build in the home input box. The queen is going to ask you questions and work out a solution with you.
<img width="2500" height="1214" alt="Image" src="https://github.com/user-attachments/assets/1ce19141-a78b-46f5-8d64-dbf987e048f4" />
### Use Template Agents
Click "Try a sample agent" and check the templates. You can run a templates directly or choose to build your version on top of the existing template.
Click "Try a sample agent" and check the templates. You can run a template directly or choose to build your version on top of the existing template.
### Run Agents
Now you can run an agent by selectiing the agent (either an existing agent or example agent). You can click the Run button on the top left, or talk to the queen agent and it can run the agent for you.
Now you can run an agent by selecting the agent (either an existing agent or example agent). You can click the Run button on the top left, or talk to the queen agent and it can run the agent for you.
<img width="2500" height="1214" alt="Image" src="https://github.com/user-attachments/assets/71c38206-2ad5-49aa-bde8-6698d0bc55f5" />
## Features
- **Browser-Use** - Control the browser on your computer to achieve hard tasks
- **Parallel Execution** - Execute the generated graph in parallel. This way you can have multiple agents completing jobs for you
- **[Goal-Driven Generation](docs/key_concepts/goals_outcome.md)** - Define objectives in natural language; the coding agent generates the agent graph and connection code to achieve them
- **[Adaptiveness](docs/key_concepts/evolution.md)** - Framework captures failures, calibrates according to the objectives, and evolves the agent graph
- **[Dynamic Node Connections](docs/key_concepts/graph.md)** - No predefined edges; connection code is generated by any capable LLM based on your goals
- **SDK-Wrapped Nodes** - Every node gets shared memory, local RLM memory, monitoring, tools, and LLM access out of the box
- **[Human-in-the-Loop](docs/key_concepts/graph.md#human-in-the-loop)** - Intervention nodes that pause execution for human input with configurable timeouts and escalation
- **Real-time Observability** - WebSocket streaming for live monitoring of agent execution, decisions, and node-to-node communication
- **Production-Ready** - Self-hostable, built for scale and reliability
<img width="2549" height="1174" alt="Screenshot 2026-03-12 at 9 27 36PM" src="https://github.com/user-attachments/assets/7c7d30fa-9ceb-4c23-95af-b1caa405547d" />
## Integration
<a href="https://github.com/aden-hive/hive/tree/main/tools/src/aden_tools/tools"><img width="100%" alt="Integration" src="https://github.com/user-attachments/assets/a1573f93-cf02-4bb8-b3d5-b305b05b1e51" /></a>
Hive is built to be model-agnostic and system-agnostic.
- **LLM flexibility** - Hive Framework is designed to support various types of LLMs, including hosted and local models through LiteLLM-compatible providers.
- **LLM flexibility** - Hive Framework supports Anthropic, OpenAI, OpenRouter, Hive LLM, and other hosted or local models through LiteLLM-compatible providers.
- **Business system connectivity** - Hive Framework is designed to connect to all kinds of business systems as tools, such as CRM, support, messaging, data, file, and internal APIs via MCP.
## Why Aden
## Why Hive
Hive focuses on generating agents that run real business processes rather than generic agents. Instead of requiring you to manually design workflows, define agent interactions, and handle failures reactively, Hive flips the paradigm: **you describe outcomes, and the system builds itself**—delivering an outcome-driven, adaptive experience with an easy-to-use set of tools and integrations.
As models improve, the upper bound of what agents can do rises — but their reliability and production value are determined by the harness. Hive focuses on generating agents that run real business processes rather than generic agents. Instead of requiring you to manually design workflows, define agent interactions, and handle failures reactively, Hive flips the paradigm: **you describe outcomes, and the system builds itself**—delivering an outcome-driven, adaptive experience with an easy-to-use set of tools and integrations.
```mermaid
flowchart LR
@@ -189,17 +192,6 @@ flowchart LR
style V6 fill:#fff,stroke:#ed8c00,stroke-width:1px,color:#cc5d00
```
### The Hive Advantage
| Traditional Frameworks | Hive |
| -------------------------- | -------------------------------------- |
| Hardcode agent workflows | Describe goals in natural language |
| Manual graph definition | Auto-generated agent graphs |
| Reactive error handling | Outcome-evaluation and adaptiveness |
| Static tool configurations | Dynamic SDK-wrapped nodes |
| Separate monitoring setup | Built-in real-time observability |
| DIY budget management | Integrated cost controls & degradation |
### How It Works
1. **[Define Your Goal](docs/key_concepts/goals_outcome.md)** → Describe what you want to achieve in plain English
@@ -215,131 +207,6 @@ flowchart LR
- [Configuration Guide](docs/configuration.md) - All configuration options
- [Architecture Overview](docs/architecture/README.md) - System design and structure
## Roadmap
Aden Hive Agent Framework aims to help developers build outcome-oriented, self-adaptive agents. See [roadmap.md](docs/roadmap.md) for details.
```mermaid
flowchart TB
%% Main Entity
User([User])
%% =========================================
%% EXTERNAL EVENT SOURCES
%% =========================================
subgraph ExtEventSource [External Event Source]
E_Sch["Schedulers"]
E_WH["Webhook"]
E_SSE["SSE"]
end
%% =========================================
%% SYSTEM NODES
%% =========================================
subgraph WorkerBees [Worker Bees]
WB_C["Conversation"]
WB_SP["System prompt"]
subgraph Graph [Graph]
direction TB
N1["Node"] --> N2["Node"] --> N3["Node"]
N1 -.-> AN["Active Node"]
N2 -.-> AN
N3 -.-> AN
%% Nested Event Loop Node
subgraph EventLoopNode [Event Loop Node]
ELN_L["listener"]
ELN_SP["System Prompt<br/>(Task)"]
ELN_EL["Event loop"]
ELN_C["Conversation"]
end
end
end
subgraph JudgeNode [Judge]
J_C["Criteria"]
J_P["Principles"]
J_EL["Event loop"] <--> J_S["Scheduler"]
end
subgraph QueenBee [Queen Bee]
QB_SP["System prompt"]
QB_EL["Event loop"]
QB_C["Conversation"]
end
subgraph Infra [Infra]
SA["Sub Agent"]
TR["Tool Registry"]
WTM["Write through Conversation Memory<br/>(Logs/RAM/Harddrive)"]
SM["Shared Memory<br/>(State/Harddrive)"]
EB["Event Bus<br/>(RAM)"]
CS["Credential Store<br/>(Harddrive/Cloud)"]
end
subgraph PC [PC]
B["Browser"]
CB["Codebase<br/>v 0.0.x ... v n.n.n"]
end
%% =========================================
%% CONNECTIONS & DATA FLOW
%% =========================================
%% External Event Routing
E_Sch --> ELN_L
E_WH --> ELN_L
E_SSE --> ELN_L
ELN_L -->|"triggers"| ELN_EL
%% User Interactions
User -->|"Talk"| WB_C
User -->|"Talk"| QB_C
User -->|"Read/Write Access"| CS
%% Inter-System Logic
ELN_C <-->|"Mirror"| WB_C
WB_C -->|"Focus"| AN
WorkerBees -->|"Inquire"| JudgeNode
JudgeNode -->|"Approve"| WorkerBees
%% Judge Alignments
J_C <-.->|"aligns"| WB_SP
J_P <-.->|"aligns"| QB_SP
%% Escalate path
J_EL -->|"Report (Escalate)"| QB_EL
%% Pub/Sub Logic
AN -->|"publish"| EB
EB -->|"subscribe"| QB_C
%% Infra and Process Spawning
ELN_EL -->|"Spawn"| SA
SA -->|"Inform"| ELN_EL
SA -->|"Starts"| B
B -->|"Report"| ELN_EL
TR -->|"Assigned"| ELN_EL
CB -->|"Modify Worker Bee"| WB_C
%% =========================================
%% SHARED MEMORY & LOGS ACCESS
%% =========================================
%% Worker Bees Access (link to node inside Graph subgraph)
AN <-->|"Read/Write"| WTM
AN <-->|"Read/Write"| SM
%% Queen Bee Access
QB_C <-->|"Read/Write"| WTM
QB_EL <-->|"Read/Write"| SM
%% Credentials Access
CS -->|"Read Access"| QB_C
```
## Contributing
We welcome contributions from the community! We're especially looking for help building tools, integrations, and example agents for the framework ([check #2805](https://github.com/aden-hive/hive/issues/2805)). If you're interested in extending its functionality, this is the perfect place to start. Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
@@ -378,7 +245,7 @@ This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENS
**Q: What LLM providers does Hive support?**
Hive supports 100+ LLM providers through LiteLLM integration, including OpenAI (GPT-4, GPT-4o), Anthropic (Claude models), Google Gemini, DeepSeek, Mistral, Groq, and many more. Simply set the appropriate API key environment variable and specify the model name. We recommend using Claude, GLM and Gemini as they have the best performance.
Hive supports 100+ LLM providers through LiteLLM integration, including OpenAI (GPT-4, GPT-4o), Anthropic (Claude models), Google Gemini, DeepSeek, Mistral, Groq, OpenRouter, and Hive LLM. Simply set the appropriate API key environment variable and specify the model name. See [docs/configuration.md](docs/configuration.md) for provider-specific configuration examples.
**Q: Can I use Hive with local AI models like Ollama?**
@@ -386,16 +253,12 @@ Yes! Hive supports local models through LiteLLM. Simply use the model name forma
**Q: What makes Hive different from other agent frameworks?**
Hive generates your entire agent system from natural language goals using a coding agent—you don't hardcode workflows or manually define graphs. When agents fail, the framework automatically captures failure data, [evolves the agent graph](docs/key_concepts/evolution.md), and redeploys. This self-improving loop is unique to Aden.
Hive is an agent harness, not just an orchestration framework. It provides the production runtime layer — session isolation, checkpoint-based crash recovery, cost enforcement, real-time observability, and human-in-the-loop controls — that makes agents reliable enough to run real workloads. On top of that, Hive generates your entire agent system from natural language goals and automatically [evolves the graph](docs/key_concepts/evolution.md) when agents fail. The combination of a robust harness with self-improving generation is what sets Hive apart.
**Q: Is Hive open-source?**
Yes, Hive is fully open-source under the Apache License 2.0. We actively encourage community contributions and collaboration.
**Q: Can Hive handle complex, production-scale use cases?**
Yes. Hive is explicitly designed for production environments with features like automatic failure recovery, real-time observability, cost controls, and horizontal scaling support. The framework handles both simple automations and complex multi-agent workflows.
**Q: Does Hive support human-in-the-loop workflows?**
Yes, Hive fully supports [human-in-the-loop](docs/key_concepts/graph.md#human-in-the-loop) workflows through intervention nodes that pause execution for human input. These include configurable timeouts and escalation policies, allowing seamless collaboration between human experts and AI agents.
@@ -420,6 +283,16 @@ Visit [docs.adenhq.com](https://docs.adenhq.com/) for complete guides, API refer
Contributions are welcome! Fork the repository, create your feature branch, implement your changes, and submit a pull request. See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
## Star History
<a href="https://star-history.com/#aden-hive/hive&Date">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=aden-hive/hive&type=Date&theme=dark" />
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=aden-hive/hive&type=Date" />
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=aden-hive/hive&type=Date" />
</picture>
</a>
---
<p align="center">
+2 -2
@@ -39,8 +39,8 @@ We consider security research conducted in accordance with this policy to be:
## Security Best Practices for Users
1. **Keep Updated**: Always run the latest version
2. **Secure Configuration**: Review `config.yaml` settings, especially in production
3. **Environment Variables**: Never commit `.env` files or `config.yaml` with secrets
2. **Secure Configuration**: Review your `~/.hive/configuration.json`, `.mcp.json`, and environment variable settings, especially in production
3. **Environment Variables**: Never commit `.env` files or any configuration files that contain secrets
4. **Network Security**: Use HTTPS in production, configure firewalls appropriately
5. **Database Security**: Use strong passwords, limit network access
-31
View File
@@ -1,31 +0,0 @@
perf: reduce subprocess spawning in quickstart scripts (#4427)
## Problem
Windows process creation (CreateProcess) is 10-100x slower than Linux fork/exec.
The quickstart scripts were spawning 4+ separate `uv run python -c "import X"`
processes to verify imports, adding ~600ms overhead on Windows.
## Solution
Consolidated all import checks into a single batch script that checks multiple
modules in one subprocess call, reducing spawn overhead by ~75%.
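The batching idea, as a minimal sketch (this is not the actual `scripts/check_requirements.py`; the module list is illustrative):

```python
#!/usr/bin/env python3
"""Verify several imports in one interpreter rather than one subprocess per module."""
import importlib
import sys

MODULES = ["httpx", "websockets", "bs4"]  # illustrative; pass the real requirement list here


def main() -> int:
    missing = []
    for name in MODULES:
        try:
            importlib.import_module(name)
        except ImportError as exc:
            missing.append(f"{name}: {exc}")
    if missing:
        print("Missing imports:\n  " + "\n  ".join(missing))
        return 1
    print(f"All {len(MODULES)} modules import cleanly.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
```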
## Changes
- **New**: `scripts/check_requirements.py` - Batched import checker
- **New**: `scripts/test_check_requirements.py` - Test suite
- **New**: `scripts/benchmark_quickstart.ps1` - Performance benchmark tool
- **Modified**: `quickstart.ps1` - Updated import verification (2 sections)
- **Modified**: `quickstart.sh` - Updated import verification
## Performance Impact
**Benchmark results on Windows:**
- Before: ~19.8 seconds for import checks
- After: ~4.9 seconds for import checks
- **Improvement: 14.9 seconds saved (75.2% faster)**
## Testing
- ✅ All functional tests pass (`scripts/test_check_requirements.py`)
- ✅ Quickstart scripts work correctly on Windows
- ✅ Error handling verified (invalid imports reported correctly)
- ✅ Performance benchmark confirms 75%+ improvement
Fixes #4427
-1
View File
@@ -1,5 +1,4 @@
exports/
docs/
.agent-builder-sessions/
.pytest_cache/
**/__pycache__/
-5
View File
@@ -1,10 +1,5 @@
{
"mcpServers": {
"agent-builder": {
"command": "python",
"args": ["-m", "framework.mcp.agent_builder_server"],
"cwd": "core"
},
"tools": {
"command": "python",
"args": ["-m", "aden_tools.mcp_server", "--stdio"],
+88 -3
View File
@@ -6,7 +6,7 @@ This guide explains how to integrate Model Context Protocol (MCP) servers with t
The framework provides built-in support for MCP servers, allowing you to:
- **Register MCP servers** via STDIO or HTTP transport
- **Register MCP servers** via STDIO, HTTP, Unix socket, or SSE transport
- **Auto-discover tools** from registered servers
- **Use MCP tools** seamlessly in your agents
- **Manage multiple MCP servers** simultaneously
@@ -104,6 +104,48 @@ runner.register_mcp_server(
- `url`: Base URL of the MCP server
- `headers`: HTTP headers to include (optional)
### Unix Socket Transport
Best for same-host inter-process communication with lower overhead than TCP:
```python
runner.register_mcp_server(
name="local-ipc-tools",
transport="unix",
url="http://localhost",
socket_path="/tmp/mcp_server.sock",
headers={
"Authorization": "Bearer token"
}
)
```
**Configuration:**
- `url`: Base URL for HTTP requests over the socket (required, e.g., `"http://localhost"`)
- `socket_path`: Absolute path to the Unix socket file (required, e.g., `"/tmp/mcp_server.sock"`)
- `headers`: HTTP headers to include (optional)
### SSE Transport
Best for real-time, event-driven connections using the MCP SDK's SSE client:
```python
runner.register_mcp_server(
name="streaming-tools",
transport="sse",
url="http://localhost:8000/sse",
headers={
"Authorization": "Bearer token"
}
)
```
**Configuration:**
- `url`: SSE endpoint URL (required, e.g., `"http://localhost:8000/sse"`)
- `headers`: HTTP headers for the SSE connection (optional)
## Using MCP Tools in Agents
Once registered, MCP tools are available just like any other tool:
@@ -258,7 +300,32 @@ runner.register_mcp_server(
)
```
### 3. Handle Cleanup
### 3. Use Unix Socket for Same-Host IPC
When both the agent and MCP server run on the same machine, Unix sockets avoid TCP overhead:
```python
runner.register_mcp_server(
name="fast-local-tools",
transport="unix",
url="http://localhost",
socket_path="/tmp/mcp_server.sock"
)
```
### 4. Use SSE for Streaming and Real-Time Tools
SSE transport maintains a persistent connection, ideal for event-driven servers:
```python
runner.register_mcp_server(
name="realtime-tools",
transport="sse",
url="http://realtime-server:8000/sse"
)
```
### 5. Handle Cleanup
Always clean up MCP connections when done:
@@ -280,7 +347,7 @@ async with AgentRunner.load("exports/my-agent") as runner:
# Automatic cleanup
```
### 4. Tool Name Conflicts
### 6. Tool Name Conflicts
If multiple MCP servers provide tools with the same name, the last registered server wins. To avoid conflicts:
@@ -315,6 +382,24 @@ If HTTP transport fails:
2. Check firewall settings
3. Verify the URL and port are correct
### Unix Socket Not Connecting
If Unix socket transport fails (a quick connectivity check sketch follows this list):
1. Verify the socket file exists: `ls -la /tmp/mcp_server.sock`
2. Check file permissions on the socket
3. Ensure no other process has locked the socket
4. Verify the `url` field is set (e.g., `"http://localhost"`)
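A quick way to exercise checks 1-3 from Python (the socket path is whatever you passed as `socket_path`; this snippet is a standalone check, not part of the framework):

```python
import socket

SOCKET_PATH = "/tmp/mcp_server.sock"  # the path you registered as socket_path

# Connecting succeeds only if the file exists, is a socket, and a server is accepting on it
with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
    s.settimeout(2.0)
    s.connect(SOCKET_PATH)
    print(f"{SOCKET_PATH} is accepting connections")
```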
### SSE Connection Issues
If SSE transport fails (a reachability check sketch follows this list):
1. Verify the server supports SSE at the given URL
2. Check that the `mcp` Python package is installed (`pip install mcp`)
3. Ensure the SSE endpoint is accessible: `curl http://localhost:8000/sse`
4. Check for firewall or proxy issues blocking long-lived connections
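To confirm the endpoint actually streams events, a small standard-library check (the URL is illustrative; this snippet is not part of the framework):

```python
import urllib.request

SSE_URL = "http://localhost:8000/sse"  # your SSE endpoint

req = urllib.request.Request(SSE_URL, headers={"Accept": "text/event-stream"})
with urllib.request.urlopen(req, timeout=10) as resp:
    print("Content-Type:", resp.headers.get("Content-Type"))  # expect text/event-stream
    # An SSE server keeps the connection open and emits lines such as "data: ..."
    print("First line:", resp.readline().decode("utf-8", "replace").strip())
```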
## Example: Full Agent with MCP Tools
Here's a complete example of an agent that uses MCP tools:
+10 -11
View File
@@ -1,17 +1,16 @@
# MCP Server Guide - Agent Builder
# MCP Server Guide - Agent Building Tools
This guide covers the MCP (Model Context Protocol) server for building goal-driven agents.
> **Note:** The standalone `agent-builder` MCP server (`framework.mcp.agent_builder_server`) has been replaced. Agent building is now done via the `coder-tools` server's `initialize_and_build_agent` tool, with underlying logic in `tools/coder_tools_server.py`.
This guide covers the MCP tools available for building goal-driven agents.
## Setup
### Quick Setup
```bash
# Using the setup script (recommended)
python setup_mcp.py
# Or using bash
./setup_mcp.sh
# Run the quickstart script (recommended)
./quickstart.sh
```
### Manual Configuration
@@ -21,10 +20,10 @@ Add to your MCP client configuration (e.g., Claude Desktop):
```json
{
"mcpServers": {
"agent-builder": {
"command": "python",
"args": ["-m", "framework.mcp.agent_builder_server"],
"cwd": "/path/to/goal-agent"
"coder-tools": {
"command": "uv",
"args": ["run", "coder_tools_server.py", "--stdio"],
"cwd": "/path/to/hive/tools"
}
}
}
+4 -59
View File
@@ -17,66 +17,11 @@ Framework provides a runtime framework that captures **decisions**, not just act
uv pip install -e .
```
## MCP Server Setup
## Agent Building
The framework includes an MCP (Model Context Protocol) server for building agents. To set up the MCP server:
Agent scaffolding is handled by the `coder-tools` MCP server, which provides the `initialize_and_build_agent` tool and related utilities; both the server and the package-generation logic live in `tools/coder_tools_server.py`.
### Automated Setup
**Using bash (Linux/macOS):**
```bash
./setup_mcp.sh
```
**Using Python (cross-platform):**
```bash
python setup_mcp.py
```
The setup script will:
1. Install the framework package
2. Install MCP dependencies (mcp, fastmcp)
3. Create/verify `.mcp.json` configuration
4. Test the MCP server module
### Manual Setup
If you prefer manual setup:
```bash
# Install framework
uv pip install -e .
# Install MCP dependencies
uv pip install mcp fastmcp
# Test the server
uv run python -m framework.mcp.agent_builder_server
```
### Using with MCP Clients
To use the agent builder with Claude Desktop or other MCP clients, add this to your MCP client configuration:
```json
{
"mcpServers": {
"agent-builder": {
"command": "python",
"args": ["-m", "framework.mcp.agent_builder_server"],
"cwd": "/path/to/hive/core"
}
}
}
```
The MCP server provides tools for:
- Creating agent building sessions
- Defining goals with success criteria
- Adding nodes (event_loop only)
- Connecting nodes with edges
- Validating and exporting agent graphs
- Testing nodes and full agent graphs
See the [Getting Started Guide](../docs/getting-started.md) for building agents.
## Quick Start
@@ -145,7 +90,7 @@ uv run python -m framework test-debug <agent_path> <test_name>
uv run python -m framework test-list <agent_path>
```
For detailed testing workflows, see the [hive-test skill](../.claude/skills/hive-test/SKILL.md).
For detailed testing workflows, see [developer-guide.md](../docs/developer-guide.md).
### Analyzing Agent Behavior with Builder
+583
View File
@@ -0,0 +1,583 @@
#!/usr/bin/env python3
"""Antigravity authentication CLI.
Implements OAuth2 flow for Google's Antigravity Code Assist gateway.
Credentials are stored in ~/.hive/antigravity-accounts.json.
Usage:
python -m antigravity_auth auth account add
python -m antigravity_auth auth account list
python -m antigravity_auth auth account remove <email>
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import secrets
import socket
import sys
import time
import urllib.parse
import urllib.request
import webbrowser
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import Any
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)
# OAuth endpoints
_OAUTH_AUTH_URL = "https://accounts.google.com/o/oauth2/v2/auth"
_OAUTH_TOKEN_URL = "https://oauth2.googleapis.com/token"
# Scopes for Antigravity/Cloud Code Assist
_OAUTH_SCOPES = [
"https://www.googleapis.com/auth/cloud-platform",
"https://www.googleapis.com/auth/userinfo.email",
"https://www.googleapis.com/auth/userinfo.profile",
]
# Credentials file path in ~/.hive/
_ACCOUNTS_FILE = Path.home() / ".hive" / "antigravity-accounts.json"
# Default project ID
_DEFAULT_PROJECT_ID = "rising-fact-p41fc"
_DEFAULT_REDIRECT_PORT = 51121
# OAuth credentials fetched from the opencode-antigravity-auth project.
# This project reverse-engineered and published the public OAuth credentials
# for Google's Antigravity/Cloud Code Assist API.
# Source: https://github.com/NoeFabris/opencode-antigravity-auth
_CREDENTIALS_URL = (
"https://raw.githubusercontent.com/NoeFabris/opencode-antigravity-auth/dev/src/constants.ts"
)
# Cached credentials fetched from public source
_cached_client_id: str | None = None
_cached_client_secret: str | None = None
def _fetch_credentials_from_public_source() -> tuple[str | None, str | None]:
"""Fetch OAuth client ID and secret from the public npm package source on GitHub."""
global _cached_client_id, _cached_client_secret
if _cached_client_id and _cached_client_secret:
return _cached_client_id, _cached_client_secret
try:
req = urllib.request.Request(
_CREDENTIALS_URL, headers={"User-Agent": "Hive-Antigravity-Auth/1.0"}
)
with urllib.request.urlopen(req, timeout=10) as resp:
content = resp.read().decode("utf-8")
import re
id_match = re.search(r'ANTIGRAVITY_CLIENT_ID\s*=\s*"([^"]+)"', content)
secret_match = re.search(r'ANTIGRAVITY_CLIENT_SECRET\s*=\s*"([^"]+)"', content)
if id_match:
_cached_client_id = id_match.group(1)
if secret_match:
_cached_client_secret = secret_match.group(1)
return _cached_client_id, _cached_client_secret
except Exception as e:
logger.debug(f"Failed to fetch credentials from public source: {e}")
return None, None
def get_client_id() -> str:
"""Get OAuth client ID from env, config, or public source."""
env_id = os.environ.get("ANTIGRAVITY_CLIENT_ID")
if env_id:
return env_id
# Try hive config
hive_cfg = Path.home() / ".hive" / "configuration.json"
if hive_cfg.exists():
try:
with open(hive_cfg) as f:
cfg = json.load(f)
cfg_id = cfg.get("llm", {}).get("antigravity_client_id")
if cfg_id:
return cfg_id
except Exception:
pass
# Fetch from public source
client_id, _ = _fetch_credentials_from_public_source()
if client_id:
return client_id
raise RuntimeError("Could not obtain Antigravity OAuth client ID")
def get_client_secret() -> str | None:
"""Get OAuth client secret from env, config, or public source."""
secret = os.environ.get("ANTIGRAVITY_CLIENT_SECRET")
if secret:
return secret
# Try to read from hive config
hive_cfg = Path.home() / ".hive" / "configuration.json"
if hive_cfg.exists():
try:
with open(hive_cfg) as f:
cfg = json.load(f)
secret = cfg.get("llm", {}).get("antigravity_client_secret")
if secret:
return secret
except Exception:
pass
# Fetch from public source (npm package on GitHub)
_, secret = _fetch_credentials_from_public_source()
return secret
def find_free_port() -> int:
"""Find an available local port."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", 0))
s.listen(1)
return s.getsockname()[1]
class OAuthCallbackHandler(BaseHTTPRequestHandler):
"""Handle OAuth callback from browser."""
auth_code: str | None = None
state: str | None = None
error: str | None = None
def log_message(self, format: str, *args: Any) -> None:
pass # Suppress default logging
def do_GET(self) -> None:
parsed = urllib.parse.urlparse(self.path)
if parsed.path == "/oauth-callback":
query = urllib.parse.parse_qs(parsed.query)
if "error" in query:
OAuthCallbackHandler.error = query["error"][0]
self._send_response("Authentication failed. You can close this window.")
return
if "code" in query and "state" in query:
OAuthCallbackHandler.auth_code = query["code"][0]
OAuthCallbackHandler.state = query["state"][0]
self._send_response(
"Authentication successful! You can close this window "
"and return to the terminal."
)
return
self._send_response("Waiting for authentication...")
def _send_response(self, message: str) -> None:
self.send_response(200)
self.send_header("Content-Type", "text/html")
self.end_headers()
html = f"""<!DOCTYPE html>
<html>
<head><title>Antigravity Auth</title></head>
<body style="font-family: system-ui; display: flex; align-items: center;
justify-content: center; height: 100vh; margin: 0; background: #1a1a2e;
color: #eee;">
<div style="text-align: center;">
<h2>{message}</h2>
</div>
</body>
</html>"""
self.wfile.write(html.encode())
def wait_for_callback(port: int, timeout: int = 300) -> tuple[str | None, str | None, str | None]:
"""Start local server and wait for OAuth callback."""
server = HTTPServer(("localhost", port), OAuthCallbackHandler)
server.timeout = 1
start = time.time()
while time.time() - start < timeout:
if OAuthCallbackHandler.auth_code or OAuthCallbackHandler.error:
return (
OAuthCallbackHandler.auth_code,
OAuthCallbackHandler.state,
OAuthCallbackHandler.error,
)
server.handle_request()
return None, None, "timeout"
def exchange_code_for_tokens(
code: str, redirect_uri: str, client_id: str, client_secret: str | None
) -> dict[str, Any] | None:
"""Exchange authorization code for tokens."""
data = {
"code": code,
"client_id": client_id,
"redirect_uri": redirect_uri,
"grant_type": "authorization_code",
}
if client_secret:
data["client_secret"] = client_secret
body = urllib.parse.urlencode(data).encode()
req = urllib.request.Request(
_OAUTH_TOKEN_URL,
data=body,
headers={"Content-Type": "application/x-www-form-urlencoded"},
method="POST",
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
return json.loads(resp.read())
except Exception as e:
logger.error(f"Token exchange failed: {e}")
return None
def get_user_email(access_token: str) -> str | None:
"""Get user email from Google API."""
req = urllib.request.Request(
"https://www.googleapis.com/oauth2/v2/userinfo",
headers={"Authorization": f"Bearer {access_token}"},
)
try:
with urllib.request.urlopen(req, timeout=10) as resp:
data = json.loads(resp.read())
return data.get("email")
except Exception:
return None
def load_accounts() -> dict[str, Any]:
"""Load existing accounts from file."""
if not _ACCOUNTS_FILE.exists():
return {"schemaVersion": 4, "accounts": []}
try:
with open(_ACCOUNTS_FILE) as f:
return json.load(f)
except Exception:
return {"schemaVersion": 4, "accounts": []}
def save_accounts(data: dict[str, Any]) -> None:
"""Save accounts to file."""
_ACCOUNTS_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(_ACCOUNTS_FILE, "w") as f:
json.dump(data, f, indent=2)
logger.info(f"Saved credentials to {_ACCOUNTS_FILE}")
def validate_credentials(access_token: str, project_id: str = _DEFAULT_PROJECT_ID) -> bool:
"""Test if credentials work by making a simple API call to Antigravity.
Returns True if credentials are valid, False otherwise.
"""
endpoint = "https://daily-cloudcode-pa.sandbox.googleapis.com"
body = {
"project": project_id,
"model": "gemini-3-flash",
"request": {
"contents": [{"role": "user", "parts": [{"text": "hi"}]}],
"generationConfig": {"maxOutputTokens": 10},
},
"requestType": "agent",
"userAgent": "antigravity",
"requestId": "validation-test",
}
headers = {
"Authorization": f"Bearer {access_token}",
"Content-Type": "application/json",
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Antigravity/1.18.3"
),
"X-Goog-Api-Client": "google-cloud-sdk vscode_cloudshelleditor/0.1",
}
try:
req = urllib.request.Request(
f"{endpoint}/v1internal:generateContent",
data=json.dumps(body).encode("utf-8"),
headers=headers,
method="POST",
)
with urllib.request.urlopen(req, timeout=30) as resp:
json.loads(resp.read())
return True
except Exception:
return False
def refresh_access_token(
refresh_token: str, client_id: str, client_secret: str | None
) -> dict | None:
"""Refresh the access token using the refresh token."""
data = {
"grant_type": "refresh_token",
"refresh_token": refresh_token,
"client_id": client_id,
}
if client_secret:
data["client_secret"] = client_secret
body = urllib.parse.urlencode(data).encode()
req = urllib.request.Request(
_OAUTH_TOKEN_URL,
data=body,
headers={"Content-Type": "application/x-www-form-urlencoded"},
method="POST",
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
return json.loads(resp.read())
except Exception as e:
logger.debug(f"Token refresh failed: {e}")
return None
def cmd_account_add(args: argparse.Namespace) -> int:
"""Add a new Antigravity account via OAuth2.
First checks if valid credentials already exist. If so, validates them
and skips OAuth if they work. Otherwise, proceeds with OAuth flow.
"""
client_id = get_client_id()
client_secret = get_client_secret()
# Check if credentials already exist
accounts_data = load_accounts()
accounts = accounts_data.get("accounts", [])
if accounts:
account = next((a for a in accounts if a.get("enabled", True) is not False), accounts[0])
access_token = account.get("access")
refresh_token_str = account.get("refresh", "")
refresh_token = refresh_token_str.split("|")[0] if refresh_token_str else None
project_id = (
refresh_token_str.split("|")[1] if "|" in refresh_token_str else _DEFAULT_PROJECT_ID
)
email = account.get("email", "unknown")
expires_ms = account.get("expires", 0)
expires_at = expires_ms / 1000.0 if expires_ms else 0.0
# Check if token is expired or near expiry
if access_token and expires_at and time.time() < expires_at - 60:
# Token still valid, test it
logger.info(f"Found existing credentials for: {email}")
logger.info("Validating existing credentials...")
if validate_credentials(access_token, project_id):
logger.info("✓ Credentials valid! Skipping OAuth.")
return 0
else:
logger.info("Credentials failed validation, refreshing...")
elif refresh_token:
logger.info(f"Found expired credentials for: {email}")
logger.info("Attempting token refresh...")
tokens = refresh_access_token(refresh_token, client_id, client_secret)
if tokens:
new_access = tokens.get("access_token")
expires_in = tokens.get("expires_in", 3600)
if new_access:
# Update the account
account["access"] = new_access
account["expires"] = int((time.time() + expires_in) * 1000)
accounts_data["last_refresh"] = time.strftime(
"%Y-%m-%dT%H:%M:%SZ", time.gmtime()
)
save_accounts(accounts_data)
# Validate the refreshed token
logger.info("Validating refreshed credentials...")
if validate_credentials(new_access, project_id):
logger.info("✓ Credentials refreshed and validated!")
return 0
else:
logger.info("Refreshed token failed validation, proceeding with OAuth...")
else:
logger.info("Token refresh failed, proceeding with OAuth...")
# No valid credentials, proceed with OAuth
if not client_secret:
logger.warning(
"No client secret configured. Token refresh may fail.\n"
"Set ANTIGRAVITY_CLIENT_SECRET env var or add "
"'antigravity_client_secret' to ~/.hive/configuration.json"
)
# Use fixed port and path matching Google's expected OAuth redirect URI
port = _DEFAULT_REDIRECT_PORT
redirect_uri = f"http://localhost:{port}/oauth-callback"
# Generate state for CSRF protection
state = secrets.token_urlsafe(16)
# Build authorization URL
params = {
"client_id": client_id,
"redirect_uri": redirect_uri,
"response_type": "code",
"scope": " ".join(_OAUTH_SCOPES),
"state": state,
"access_type": "offline",
"prompt": "consent",
}
auth_url = f"{_OAUTH_AUTH_URL}?{urllib.parse.urlencode(params)}"
logger.info("Opening browser for authentication...")
logger.info(f"If the browser doesn't open, visit: {auth_url}\n")
# Open browser
webbrowser.open(auth_url)
# Wait for callback
logger.info(f"Listening for callback on port {port}...")
code, received_state, error = wait_for_callback(port)
if error:
logger.error(f"Authentication failed: {error}")
return 1
if not code:
logger.error("No authorization code received")
return 1
if received_state != state:
logger.error("State mismatch - possible CSRF attack")
return 1
# Exchange code for tokens
logger.info("Exchanging authorization code for tokens...")
tokens = exchange_code_for_tokens(code, redirect_uri, client_id, client_secret)
if not tokens:
return 1
access_token = tokens.get("access_token")
refresh_token = tokens.get("refresh_token")
expires_in = tokens.get("expires_in", 3600)
if not access_token:
logger.error("No access token in response")
return 1
# Get user email
email = get_user_email(access_token)
if email:
logger.info(f"Authenticated as: {email}")
# Load existing accounts and add/update
accounts_data = load_accounts()
accounts = accounts_data.get("accounts", [])
# Build new account entry (V4 schema)
expires_ms = int((time.time() + expires_in) * 1000)
refresh_entry = f"{refresh_token}|{_DEFAULT_PROJECT_ID}"
new_account = {
"access": access_token,
"refresh": refresh_entry,
"expires": expires_ms,
"email": email,
"enabled": True,
}
# Update existing account or add new one
existing_idx = next((i for i, a in enumerate(accounts) if a.get("email") == email), None)
if existing_idx is not None:
accounts[existing_idx] = new_account
logger.info(f"Updated existing account: {email}")
else:
accounts.append(new_account)
logger.info(f"Added new account: {email}")
accounts_data["accounts"] = accounts
accounts_data["schemaVersion"] = 4
accounts_data["last_refresh"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
save_accounts(accounts_data)
logger.info("\n✓ Authentication complete!")
return 0
def cmd_account_list(args: argparse.Namespace) -> int:
"""List all stored accounts."""
data = load_accounts()
accounts = data.get("accounts", [])
if not accounts:
logger.info("No accounts configured.")
logger.info("Run 'antigravity auth account add' to add one.")
return 0
logger.info("Configured accounts:\n")
for i, account in enumerate(accounts, 1):
email = account.get("email", "unknown")
enabled = "enabled" if account.get("enabled", True) else "disabled"
logger.info(f" {i}. {email} ({enabled})")
return 0
def cmd_account_remove(args: argparse.Namespace) -> int:
"""Remove an account by email."""
email = args.email
data = load_accounts()
accounts = data.get("accounts", [])
original_len = len(accounts)
accounts = [a for a in accounts if a.get("email") != email]
if len(accounts) == original_len:
logger.error(f"No account found with email: {email}")
return 1
data["accounts"] = accounts
save_accounts(data)
logger.info(f"Removed account: {email}")
return 0
def main() -> int:
parser = argparse.ArgumentParser(
description="Antigravity authentication CLI",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
subparsers = parser.add_subparsers(dest="command", help="Commands")
# auth account add
auth_parser = subparsers.add_parser("auth", help="Authentication commands")
auth_subparsers = auth_parser.add_subparsers(dest="auth_command")
account_parser = auth_subparsers.add_parser("account", help="Account management")
account_subparsers = account_parser.add_subparsers(dest="account_command")
add_parser = account_subparsers.add_parser("add", help="Add a new account via OAuth2")
add_parser.set_defaults(func=cmd_account_add)
list_parser = account_subparsers.add_parser("list", help="List configured accounts")
list_parser.set_defaults(func=cmd_account_list)
remove_parser = account_subparsers.add_parser("remove", help="Remove an account")
remove_parser.add_argument("email", help="Email of account to remove")
remove_parser.set_defaults(func=cmd_account_remove)
args = parser.parse_args()
if hasattr(args, "func"):
return args.func(args)
parser.print_help()
return 0
if __name__ == "__main__":
sys.exit(main())
+81 -27
View File
@@ -17,6 +17,7 @@ import http.server
import json
import os
import platform
import queue
import secrets
import subprocess
import sys
@@ -27,6 +28,7 @@ import urllib.parse
import urllib.request
from datetime import UTC, datetime
from pathlib import Path
from typing import TextIO
# OAuth constants (from the Codex CLI binary)
CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann"
@@ -165,11 +167,11 @@ def open_browser(url: str) -> bool:
if system == "Darwin":
subprocess.Popen(["open", url], stdout=devnull, stderr=devnull)
elif system == "Windows":
subprocess.Popen(["cmd", "/c", "start", url], stdout=devnull, stderr=devnull)
os.startfile(url) # type: ignore[attr-defined]
else:
subprocess.Popen(["xdg-open", url], stdout=devnull, stderr=devnull)
return True
except OSError:
except (AttributeError, OSError):
return False
@@ -266,6 +268,71 @@ def parse_manual_input(value: str, expected_state: str) -> str | None:
return None
def _read_manual_input_lines(
manual_inputs: queue.Queue[str],
stop_event: threading.Event,
stdin: TextIO | None = None,
) -> None:
stream = sys.stdin if stdin is None else stdin
while not stop_event.is_set():
try:
manual = stream.readline()
except (EOFError, OSError):
return
if not manual:
return
if manual.strip():
manual_inputs.put(manual)
def wait_for_code_from_callback_or_stdin(
expected_state: str,
callback_result: list[str | None],
callback_done: threading.Event,
timeout_secs: float = 120,
poll_interval: float = 0.1,
stdin: TextIO | None = None,
) -> str | None:
manual_inputs: queue.Queue[str] = queue.Queue()
stop_event = threading.Event()
# Read stdin on a daemon thread so manual paste works on platforms where
# select() cannot poll console handles, including Windows terminals.
threading.Thread(
target=_read_manual_input_lines,
args=(manual_inputs, stop_event, stdin),
daemon=True,
).start()
deadline = time.time() + timeout_secs
try:
while time.time() < deadline:
if callback_result[0]:
return callback_result[0]
while True:
try:
manual = manual_inputs.get_nowait()
except queue.Empty:
break
code = parse_manual_input(manual, expected_state)
if code:
return code
if callback_done.is_set():
return callback_result[0]
time.sleep(poll_interval)
return callback_result[0]
finally:
stop_event.set()
def main() -> int:
# Generate PKCE and state
verifier, challenge = generate_pkce()
@@ -315,41 +382,28 @@ def main() -> int:
# Start callback server in background
callback_result: list[str | None] = [None]
callback_done = threading.Event()
def run_server() -> None:
callback_result[0] = wait_for_callback(state, timeout_secs=120)
try:
callback_result[0] = wait_for_callback(state, timeout_secs=120)
finally:
callback_done.set()
server_thread = threading.Thread(target=run_server)
server_thread.daemon = True
server_thread.start()
# Also accept manual input in parallel
# We poll for both the server result and stdin
try:
import select
while server_thread.is_alive():
# Check if stdin has data (non-blocking on unix)
if hasattr(select, "select"):
ready, _, _ = select.select([sys.stdin], [], [], 0.5)
if ready:
manual = sys.stdin.readline()
if manual.strip():
code = parse_manual_input(manual, state)
if code:
break
else:
time.sleep(0.5)
if callback_result[0]:
code = callback_result[0]
break
except (KeyboardInterrupt, EOFError):
code = wait_for_code_from_callback_or_stdin(
state,
callback_result,
callback_done,
timeout_secs=120,
)
except KeyboardInterrupt:
print("\n\033[0;31mCancelled.\033[0m")
return 1
if not code:
code = callback_result[0]
else:
# Manual paste mode
try:
-740
View File
@@ -1,740 +0,0 @@
#!/usr/bin/env python3
"""
EventLoopNode WebSocket Demo
Real LLM, real FileConversationStore, real EventBus.
Streams EventLoopNode execution to a browser via WebSocket.
Usage:
cd /home/timothy/oss/hive/core
python demos/event_loop_wss_demo.py
Then open http://localhost:8765 in your browser.
"""
import asyncio
import json
import logging
import sys
import tempfile
from http import HTTPStatus
from pathlib import Path
import httpx
import websockets
from bs4 import BeautifulSoup
from websockets.http11 import Request, Response
# Add core, tools, and hive root to path
_CORE_DIR = Path(__file__).resolve().parent.parent
_HIVE_DIR = _CORE_DIR.parent
sys.path.insert(0, str(_CORE_DIR)) # framework.*
sys.path.insert(0, str(_HIVE_DIR / "tools" / "src")) # aden_tools.*
sys.path.insert(0, str(_HIVE_DIR)) # core.framework.* (for aden_tools imports)
import os # noqa: E402
from aden_tools.credentials import CREDENTIAL_SPECS, CredentialStoreAdapter # noqa: E402
from core.framework.credentials import CredentialStore # noqa: E402
from framework.credentials.storage import ( # noqa: E402
CompositeStorage,
EncryptedFileStorage,
EnvVarStorage,
)
from framework.graph.event_loop_node import EventLoopNode, LoopConfig # noqa: E402
from framework.graph.node import NodeContext, NodeSpec, SharedMemory # noqa: E402
from framework.llm.litellm import LiteLLMProvider # noqa: E402
from framework.llm.provider import Tool # noqa: E402
from framework.runner.tool_registry import ToolRegistry # noqa: E402
from framework.runtime.core import Runtime # noqa: E402
from framework.runtime.event_bus import EventBus, EventType # noqa: E402
from framework.storage.conversation_store import FileConversationStore # noqa: E402
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s")
logger = logging.getLogger("demo")
# -------------------------------------------------------------------------
# Persistent state (shared across WebSocket connections)
# -------------------------------------------------------------------------
STORE_DIR = Path(tempfile.mkdtemp(prefix="hive_demo_"))
STORE = FileConversationStore(STORE_DIR / "conversation")
RUNTIME = Runtime(STORE_DIR / "runtime")
LLM = LiteLLMProvider(model="claude-sonnet-4-5-20250929")
# -------------------------------------------------------------------------
# Tool Registry — real tools via ToolRegistry (same pattern as GraphExecutor)
# -------------------------------------------------------------------------
TOOL_REGISTRY = ToolRegistry()
# Credential store: Aden sync (OAuth2 tokens) + encrypted files + env var fallback
_env_mapping = {name: spec.env_var for name, spec in CREDENTIAL_SPECS.items()}
_local_storage = CompositeStorage(
primary=EncryptedFileStorage(),
fallbacks=[EnvVarStorage(env_mapping=_env_mapping)],
)
if os.environ.get("ADEN_API_KEY"):
try:
from framework.credentials.aden import ( # noqa: E402
AdenCachedStorage,
AdenClientConfig,
AdenCredentialClient,
AdenSyncProvider,
)
_client = AdenCredentialClient(AdenClientConfig(base_url="https://api.adenhq.com"))
_provider = AdenSyncProvider(client=_client)
_storage = AdenCachedStorage(
local_storage=_local_storage,
aden_provider=_provider,
)
_cred_store = CredentialStore(storage=_storage, providers=[_provider], auto_refresh=True)
_synced = _provider.sync_all(_cred_store)
logger.info("Synced %d credentials from Aden", _synced)
except Exception as e:
logger.warning("Aden sync unavailable: %s", e)
_cred_store = CredentialStore(storage=_local_storage)
else:
logger.info("ADEN_API_KEY not set, using local credential storage")
_cred_store = CredentialStore(storage=_local_storage)
CREDENTIALS = CredentialStoreAdapter(_cred_store)
# Debug: log which credentials resolved
for _name in ["brave_search", "hubspot", "anthropic"]:
_val = CREDENTIALS.get(_name)
if _val:
logger.debug("credential %s: OK (len=%d)", _name, len(_val))
else:
logger.debug("credential %s: not found", _name)
# --- web_search (Brave Search API) ---
TOOL_REGISTRY.register(
name="web_search",
tool=Tool(
name="web_search",
description=(
"Search the web for current information. "
"Returns titles, URLs, and snippets from search results."
),
parameters={
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query (1-500 characters)",
},
"num_results": {
"type": "integer",
"description": "Number of results to return (1-20, default 10)",
},
},
"required": ["query"],
},
),
executor=lambda inputs: _exec_web_search(inputs),
)
def _exec_web_search(inputs: dict) -> dict:
api_key = CREDENTIALS.get("brave_search")
if not api_key:
return {"error": "brave_search credential not configured"}
query = inputs.get("query", "")
num_results = min(inputs.get("num_results", 10), 20)
resp = httpx.get(
"https://api.search.brave.com/res/v1/web/search",
params={"q": query, "count": num_results},
headers={"X-Subscription-Token": api_key, "Accept": "application/json"},
timeout=30.0,
)
if resp.status_code != 200:
return {"error": f"Brave API HTTP {resp.status_code}"}
data = resp.json()
results = [
{
"title": item.get("title", ""),
"url": item.get("url", ""),
"snippet": item.get("description", ""),
}
for item in data.get("web", {}).get("results", [])[:num_results]
]
return {"query": query, "results": results, "total": len(results)}
# --- web_scrape (httpx + BeautifulSoup, no playwright for sync compat) ---
TOOL_REGISTRY.register(
name="web_scrape",
tool=Tool(
name="web_scrape",
description=(
"Scrape and extract text content from a webpage URL. "
"Returns the page title and main text content."
),
parameters={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL of the webpage to scrape",
},
"max_length": {
"type": "integer",
"description": "Maximum text length (default 50000)",
},
},
"required": ["url"],
},
),
executor=lambda inputs: _exec_web_scrape(inputs),
)
_SCRAPE_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/131.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml",
}
def _exec_web_scrape(inputs: dict) -> dict:
url = inputs.get("url", "")
max_length = max(1000, min(inputs.get("max_length", 50000), 500000))
if not url.startswith(("http://", "https://")):
url = "https://" + url
try:
resp = httpx.get(url, timeout=30.0, follow_redirects=True, headers=_SCRAPE_HEADERS)
if resp.status_code != 200:
return {"error": f"HTTP {resp.status_code}"}
soup = BeautifulSoup(resp.text, "html.parser")
for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
tag.decompose()
title = soup.title.get_text(strip=True) if soup.title else ""
main = (
soup.find("article")
or soup.find("main")
or soup.find(attrs={"role": "main"})
or soup.find("body")
)
text = main.get_text(separator=" ", strip=True) if main else ""
text = " ".join(text.split())
if len(text) > max_length:
text = text[:max_length] + "..."
return {"url": url, "title": title, "content": text, "length": len(text)}
except httpx.TimeoutException:
return {"error": "Request timed out"}
except Exception as e:
return {"error": f"Scrape failed: {e}"}
# --- HubSpot CRM tools (optional, requires HUBSPOT_ACCESS_TOKEN) ---
_HUBSPOT_API = "https://api.hubapi.com"
def _hubspot_headers() -> dict | None:
token = CREDENTIALS.get("hubspot")
if token:
logger.debug("HubSpot token: %s...%s (len=%d)", token[:8], token[-4:], len(token))
else:
logger.debug("HubSpot token: not found")
if not token:
return None
return {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
"Accept": "application/json",
}
def _exec_hubspot_search(inputs: dict) -> dict:
headers = _hubspot_headers()
if not headers:
return {"error": "HUBSPOT_ACCESS_TOKEN not set"}
object_type = inputs.get("object_type", "contacts")
query = inputs.get("query", "")
limit = min(inputs.get("limit", 10), 100)
body: dict = {"limit": limit}
if query:
body["query"] = query
try:
resp = httpx.post(
f"{_HUBSPOT_API}/crm/v3/objects/{object_type}/search",
headers=headers,
json=body,
timeout=30.0,
)
if resp.status_code != 200:
return {"error": f"HubSpot API HTTP {resp.status_code}: {resp.text[:200]}"}
return resp.json()
except httpx.TimeoutException:
return {"error": "Request timed out"}
except Exception as e:
return {"error": f"HubSpot error: {e}"}
TOOL_REGISTRY.register(
name="hubspot_search",
tool=Tool(
name="hubspot_search",
description=(
"Search HubSpot CRM objects (contacts, companies, or deals). "
"Returns matching records with their properties."
),
parameters={
"type": "object",
"properties": {
"object_type": {
"type": "string",
"description": "CRM object type: 'contacts', 'companies', or 'deals'",
},
"query": {
"type": "string",
"description": "Search query (name, email, domain, etc.)",
},
"limit": {
"type": "integer",
"description": "Max results (1-100, default 10)",
},
},
"required": ["object_type"],
},
),
executor=lambda inputs: _exec_hubspot_search(inputs),
)
logger.info(
"ToolRegistry loaded: %s",
", ".join(TOOL_REGISTRY.get_registered_names()),
)
# -------------------------------------------------------------------------
# HTML page (embedded)
# -------------------------------------------------------------------------
HTML_PAGE = ( # noqa: E501
"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>EventLoopNode Live Demo</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: 'SF Mono', 'Fira Code', monospace;
background: #0d1117; color: #c9d1d9;
height: 100vh; display: flex; flex-direction: column;
}
header {
background: #161b22; padding: 12px 20px;
border-bottom: 1px solid #30363d;
display: flex; align-items: center; gap: 16px;
}
header h1 { font-size: 16px; color: #58a6ff; font-weight: 600; }
.status {
font-size: 12px; padding: 3px 10px; border-radius: 12px;
background: #21262d; color: #8b949e;
}
.status.running { background: #1a4b2e; color: #3fb950; }
.status.done { background: #1a3a5c; color: #58a6ff; }
.status.error { background: #4b1a1a; color: #f85149; }
.chat { flex: 1; overflow-y: auto; padding: 16px; }
.msg {
margin: 8px 0; padding: 10px 14px; border-radius: 8px;
line-height: 1.6; white-space: pre-wrap; word-wrap: break-word;
}
.msg.user { background: #1a3a5c; color: #58a6ff; }
.msg.assistant { background: #161b22; color: #c9d1d9; }
.msg.event {
background: transparent; color: #8b949e; font-size: 11px;
padding: 4px 14px; border-left: 3px solid #30363d;
}
.msg.event.loop { border-left-color: #58a6ff; }
.msg.event.tool { border-left-color: #d29922; }
.msg.event.stall { border-left-color: #f85149; }
.input-bar {
padding: 12px 16px; background: #161b22;
border-top: 1px solid #30363d; display: flex; gap: 8px;
}
.input-bar input {
flex: 1; background: #0d1117; border: 1px solid #30363d;
color: #c9d1d9; padding: 8px 12px; border-radius: 6px;
font-family: inherit; font-size: 14px; outline: none;
}
.input-bar input:focus { border-color: #58a6ff; }
.input-bar button {
background: #238636; color: #fff; border: none;
padding: 8px 20px; border-radius: 6px; cursor: pointer;
font-family: inherit; font-weight: 600;
}
.input-bar button:hover { background: #2ea043; }
.input-bar button:disabled {
background: #21262d; color: #484f58; cursor: not-allowed;
}
.input-bar button.clear { background: #da3633; }
.input-bar button.clear:hover { background: #f85149; }
</style>
</head>
<body>
<header>
<h1>EventLoopNode Live</h1>
<span id="status" class="status">Idle</span>
<span id="iter" class="status" style="display:none">Step 0</span>
</header>
<div id="chat" class="chat"></div>
<div class="input-bar">
<input id="input" type="text"
placeholder="Ask anything..." autofocus />
<button id="go" onclick="run()">Send</button>
<button class="clear"
onclick="clearConversation()">Clear</button>
</div>
<script>
let ws = null;
let currentAssistantEl = null;
let iterCount = 0;
const chat = document.getElementById('chat');
const status = document.getElementById('status');
const iterEl = document.getElementById('iter');
const goBtn = document.getElementById('go');
const inputEl = document.getElementById('input');
inputEl.addEventListener('keydown', e => {
if (e.key === 'Enter') run();
});
function setStatus(text, cls) {
status.textContent = text;
status.className = 'status ' + cls;
}
function addMsg(text, cls) {
const el = document.createElement('div');
el.className = 'msg ' + cls;
el.textContent = text;
chat.appendChild(el);
chat.scrollTop = chat.scrollHeight;
return el;
}
function connect() {
ws = new WebSocket('ws://' + location.host + '/ws');
ws.onopen = () => {
setStatus('Ready', 'done');
goBtn.disabled = false;
};
ws.onmessage = handleEvent;
ws.onerror = () => { setStatus('Error', 'error'); };
ws.onclose = () => {
setStatus('Reconnecting...', '');
goBtn.disabled = true;
setTimeout(connect, 2000);
};
}
function handleEvent(msg) {
const evt = JSON.parse(msg.data);
if (evt.type === 'llm_text_delta') {
if (currentAssistantEl) {
currentAssistantEl.textContent += evt.content;
chat.scrollTop = chat.scrollHeight;
}
}
else if (evt.type === 'ready') {
setStatus('Ready', 'done');
if (currentAssistantEl && !currentAssistantEl.textContent)
currentAssistantEl.remove();
goBtn.disabled = false;
}
else if (evt.type === 'node_loop_iteration') {
iterCount = evt.iteration || (iterCount + 1);
iterEl.textContent = 'Step ' + iterCount;
iterEl.style.display = '';
}
else if (evt.type === 'tool_call_started') {
var info = evt.tool_name + '('
+ JSON.stringify(evt.tool_input).slice(0, 120) + ')';
addMsg('TOOL ' + info, 'event tool');
}
else if (evt.type === 'tool_call_completed') {
var preview = (evt.result || '').slice(0, 200);
var cls = evt.is_error ? 'stall' : 'tool';
addMsg('RESULT ' + evt.tool_name + ': ' + preview,
'event ' + cls);
currentAssistantEl = addMsg('', 'assistant');
}
else if (evt.type === 'result') {
setStatus('Session ended', evt.success ? 'done' : 'error');
if (evt.error) addMsg('ERROR ' + evt.error, 'event stall');
if (currentAssistantEl && !currentAssistantEl.textContent)
currentAssistantEl.remove();
goBtn.disabled = false;
}
else if (evt.type === 'node_stalled') {
addMsg('STALLED ' + evt.reason, 'event stall');
}
else if (evt.type === 'cleared') {
chat.innerHTML = '';
iterCount = 0;
iterEl.textContent = 'Step 0';
iterEl.style.display = 'none';
setStatus('Ready', 'done');
goBtn.disabled = false;
}
}
function run() {
const text = inputEl.value.trim();
if (!text || !ws || ws.readyState !== 1) return;
addMsg(text, 'user');
currentAssistantEl = addMsg('', 'assistant');
inputEl.value = '';
setStatus('Running', 'running');
goBtn.disabled = true;
ws.send(JSON.stringify({ topic: text }));
}
function clearConversation() {
if (ws && ws.readyState === 1) {
ws.send(JSON.stringify({ command: 'clear' }));
}
}
connect();
</script>
</body>
</html>"""
)
# -------------------------------------------------------------------------
# WebSocket handler
# -------------------------------------------------------------------------
async def handle_ws(websocket):
"""Persistent WebSocket: long-lived EventLoopNode with client_facing blocking."""
global STORE
# -- Event forwarding (WebSocket ← EventBus) ----------------------------
bus = EventBus()
async def forward_event(event):
try:
payload = {"type": event.type.value, **event.data}
if event.node_id:
payload["node_id"] = event.node_id
await websocket.send(json.dumps(payload))
except Exception:
pass
bus.subscribe(
event_types=[
EventType.NODE_LOOP_STARTED,
EventType.NODE_LOOP_ITERATION,
EventType.NODE_LOOP_COMPLETED,
EventType.LLM_TEXT_DELTA,
EventType.TOOL_CALL_STARTED,
EventType.TOOL_CALL_COMPLETED,
EventType.NODE_STALLED,
],
handler=forward_event,
)
# -- Per-connection state -----------------------------------------------
node = None
loop_task = None
tools = list(TOOL_REGISTRY.get_tools().values())
tool_executor = TOOL_REGISTRY.get_executor()
node_spec = NodeSpec(
id="assistant",
name="Chat Assistant",
description="A conversational assistant that remembers context across messages",
node_type="event_loop",
client_facing=True,
system_prompt=(
"You are a helpful assistant with access to tools. "
"You can search the web, scrape webpages, and query HubSpot CRM. "
"Use tools when the user asks for current information or external data. "
"You have full conversation history, so you can reference previous messages."
),
)
# -- Ready callback: subscribe to CLIENT_INPUT_REQUESTED on the bus ---
async def on_input_requested(event):
try:
await websocket.send(json.dumps({"type": "ready"}))
except Exception:
pass
bus.subscribe(
event_types=[EventType.CLIENT_INPUT_REQUESTED],
handler=on_input_requested,
)
async def start_loop(first_message: str):
"""Create an EventLoopNode and run it as a background task."""
nonlocal node, loop_task
memory = SharedMemory()
ctx = NodeContext(
runtime=RUNTIME,
node_id="assistant",
node_spec=node_spec,
memory=memory,
input_data={},
llm=LLM,
available_tools=tools,
)
node = EventLoopNode(
event_bus=bus,
config=LoopConfig(max_iterations=10_000, max_history_tokens=32_000),
conversation_store=STORE,
tool_executor=tool_executor,
)
await node.inject_event(first_message)
async def _run():
try:
result = await node.execute(ctx)
try:
await websocket.send(
json.dumps(
{
"type": "result",
"success": result.success,
"output": result.output,
"error": result.error,
"tokens": result.tokens_used,
}
)
)
except Exception:
pass
logger.info(f"Loop ended: success={result.success}, tokens={result.tokens_used}")
except websockets.exceptions.ConnectionClosed:
logger.info("Loop stopped: WebSocket closed")
except Exception as e:
logger.exception("Loop error")
try:
await websocket.send(
json.dumps(
{
"type": "result",
"success": False,
"error": str(e),
"output": {},
}
)
)
except Exception:
pass
loop_task = asyncio.create_task(_run())
async def stop_loop():
"""Signal the node and wait for the loop task to finish."""
nonlocal node, loop_task
if loop_task and not loop_task.done():
if node:
node.signal_shutdown()
try:
await asyncio.wait_for(loop_task, timeout=5.0)
except (TimeoutError, asyncio.CancelledError):
loop_task.cancel()
node = None
loop_task = None
# -- Message loop (runs for the lifetime of this WebSocket) -------------
try:
async for raw in websocket:
try:
msg = json.loads(raw)
except Exception:
continue
# Clear command
if msg.get("command") == "clear":
import shutil
await stop_loop()
await STORE.close()
conv_dir = STORE_DIR / "conversation"
if conv_dir.exists():
shutil.rmtree(conv_dir)
STORE = FileConversationStore(conv_dir)
await websocket.send(json.dumps({"type": "cleared"}))
logger.info("Conversation cleared")
continue
topic = msg.get("topic", "")
if not topic:
continue
if node is None:
# First message — spin up the loop
logger.info(f"Starting persistent loop: {topic}")
await start_loop(topic)
else:
# Subsequent message — inject into the running loop
logger.info(f"Injecting message: {topic}")
await node.inject_event(topic)
except websockets.exceptions.ConnectionClosed:
pass
finally:
await stop_loop()
logger.info("WebSocket closed, loop stopped")
# -------------------------------------------------------------------------
# HTTP handler for serving the HTML page
# -------------------------------------------------------------------------
async def process_request(connection, request: Request):
"""Serve HTML on GET /, upgrade to WebSocket on /ws."""
if request.path == "/ws":
return None # let websockets handle the upgrade
# Serve the HTML page for any other path
return Response(
HTTPStatus.OK,
"OK",
websockets.Headers({"Content-Type": "text/html; charset=utf-8"}),
HTML_PAGE.encode(),
)
# -------------------------------------------------------------------------
# Main
# -------------------------------------------------------------------------
async def main():
port = 8765
async with websockets.serve(
handle_ws,
"0.0.0.0",
port,
process_request=process_request,
):
logger.info(f"Demo running at http://localhost:{port}")
logger.info("Open in your browser and enter a topic to research.")
await asyncio.Future() # run forever
if __name__ == "__main__":
asyncio.run(main())
File diff suppressed because it is too large
-930
View File
@@ -1,930 +0,0 @@
#!/usr/bin/env python3
"""
Two-Node ContextHandoff Demo
Demonstrates ContextHandoff between two EventLoopNode instances:
Node A (Researcher) ContextHandoff Node B (Analyst)
Real LLM, real FileConversationStore, real EventBus.
Streams both nodes to a browser via WebSocket.
Usage:
cd /home/timothy/oss/hive/core
python demos/handoff_demo.py
Then open http://localhost:8766 in your browser.
"""
import asyncio
import json
import logging
import sys
import tempfile
from http import HTTPStatus
from pathlib import Path
import httpx
import websockets
from bs4 import BeautifulSoup
from websockets.http11 import Request, Response
# Add core, tools, and hive root to path
_CORE_DIR = Path(__file__).resolve().parent.parent
_HIVE_DIR = _CORE_DIR.parent
sys.path.insert(0, str(_CORE_DIR)) # framework.*
sys.path.insert(0, str(_HIVE_DIR / "tools" / "src")) # aden_tools.*
sys.path.insert(0, str(_HIVE_DIR)) # core.framework.* (for aden_tools imports)
from aden_tools.credentials import CREDENTIAL_SPECS, CredentialStoreAdapter # noqa: E402
from core.framework.credentials import CredentialStore # noqa: E402
from framework.credentials.storage import ( # noqa: E402
CompositeStorage,
EncryptedFileStorage,
EnvVarStorage,
)
from framework.graph.context_handoff import ContextHandoff # noqa: E402
from framework.graph.conversation import NodeConversation # noqa: E402
from framework.graph.event_loop_node import EventLoopNode, LoopConfig # noqa: E402
from framework.graph.node import NodeContext, NodeSpec, SharedMemory # noqa: E402
from framework.llm.litellm import LiteLLMProvider # noqa: E402
from framework.llm.provider import Tool # noqa: E402
from framework.runner.tool_registry import ToolRegistry # noqa: E402
from framework.runtime.core import Runtime # noqa: E402
from framework.runtime.event_bus import EventBus, EventType # noqa: E402
from framework.storage.conversation_store import FileConversationStore # noqa: E402
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s")
logger = logging.getLogger("handoff_demo")
# -------------------------------------------------------------------------
# Persistent state
# -------------------------------------------------------------------------
STORE_DIR = Path(tempfile.mkdtemp(prefix="hive_handoff_"))
RUNTIME = Runtime(STORE_DIR / "runtime")
LLM = LiteLLMProvider(model="claude-sonnet-4-5-20250929")
# -------------------------------------------------------------------------
# Credentials
# -------------------------------------------------------------------------
# Composite credential store: encrypted files (primary) + env vars (fallback)
_env_mapping = {name: spec.env_var for name, spec in CREDENTIAL_SPECS.items()}
_composite = CompositeStorage(
primary=EncryptedFileStorage(),
fallbacks=[EnvVarStorage(env_mapping=_env_mapping)],
)
CREDENTIALS = CredentialStoreAdapter(CredentialStore(storage=_composite))
for _name in ["brave_search", "hubspot"]:
_val = CREDENTIALS.get(_name)
if _val:
logger.debug("credential %s: OK (len=%d)", _name, len(_val))
else:
logger.debug("credential %s: not found", _name)
# -------------------------------------------------------------------------
# Tool Registry — web_search + web_scrape for Node A (Researcher)
# -------------------------------------------------------------------------
TOOL_REGISTRY = ToolRegistry()
def _exec_web_search(inputs: dict) -> dict:
api_key = CREDENTIALS.get("brave_search")
if not api_key:
return {"error": "brave_search credential not configured"}
query = inputs.get("query", "")
num_results = min(inputs.get("num_results", 10), 20)
resp = httpx.get(
"https://api.search.brave.com/res/v1/web/search",
params={"q": query, "count": num_results},
headers={
"X-Subscription-Token": api_key,
"Accept": "application/json",
},
timeout=30.0,
)
if resp.status_code != 200:
return {"error": f"Brave API HTTP {resp.status_code}"}
data = resp.json()
results = [
{
"title": item.get("title", ""),
"url": item.get("url", ""),
"snippet": item.get("description", ""),
}
for item in data.get("web", {}).get("results", [])[:num_results]
]
return {"query": query, "results": results, "total": len(results)}
TOOL_REGISTRY.register(
name="web_search",
tool=Tool(
name="web_search",
description=(
"Search the web for current information. "
"Returns titles, URLs, and snippets from search results."
),
parameters={
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query (1-500 characters)",
},
"num_results": {
"type": "integer",
"description": "Number of results (1-20, default 10)",
},
},
"required": ["query"],
},
),
executor=lambda inputs: _exec_web_search(inputs),
)
_SCRAPE_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/131.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml",
}
def _exec_web_scrape(inputs: dict) -> dict:
url = inputs.get("url", "")
max_length = max(1000, min(inputs.get("max_length", 50000), 500000))
if not url.startswith(("http://", "https://")):
url = "https://" + url
try:
resp = httpx.get(
url,
timeout=30.0,
follow_redirects=True,
headers=_SCRAPE_HEADERS,
)
if resp.status_code != 200:
return {"error": f"HTTP {resp.status_code}"}
soup = BeautifulSoup(resp.text, "html.parser")
for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
tag.decompose()
title = soup.title.get_text(strip=True) if soup.title else ""
main = (
soup.find("article")
or soup.find("main")
or soup.find(attrs={"role": "main"})
or soup.find("body")
)
text = main.get_text(separator=" ", strip=True) if main else ""
text = " ".join(text.split())
if len(text) > max_length:
text = text[:max_length] + "..."
return {
"url": url,
"title": title,
"content": text,
"length": len(text),
}
except httpx.TimeoutException:
return {"error": "Request timed out"}
except Exception as e:
return {"error": f"Scrape failed: {e}"}
TOOL_REGISTRY.register(
name="web_scrape",
tool=Tool(
name="web_scrape",
description=(
"Scrape and extract text content from a webpage URL. "
"Returns the page title and main text content."
),
parameters={
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "URL of the webpage to scrape",
},
"max_length": {
"type": "integer",
"description": "Maximum text length (default 50000)",
},
},
"required": ["url"],
},
),
executor=lambda inputs: _exec_web_scrape(inputs),
)
logger.info(
"ToolRegistry loaded: %s",
", ".join(TOOL_REGISTRY.get_registered_names()),
)
# -------------------------------------------------------------------------
# Node Specs
# -------------------------------------------------------------------------
RESEARCHER_SPEC = NodeSpec(
id="researcher",
name="Researcher",
description="Researches a topic using web search and scraping tools",
node_type="event_loop",
input_keys=["topic"],
output_keys=["research_summary"],
system_prompt=(
"You are a thorough research assistant. Your job is to research "
"the given topic using the web_search and web_scrape tools.\n\n"
"1. Search for relevant information on the topic\n"
"2. Scrape 1-2 of the most promising URLs for details\n"
"3. Synthesize your findings into a comprehensive summary\n"
"4. Use set_output with key='research_summary' to save your "
"findings\n\n"
"Be thorough but efficient. Aim for 2-4 search/scrape calls, "
"then summarize and set_output."
),
)
ANALYST_SPEC = NodeSpec(
id="analyst",
name="Analyst",
description="Analyzes research findings and provides insights",
node_type="event_loop",
input_keys=["context"],
output_keys=["analysis"],
system_prompt=(
"You are a strategic analyst. You receive research findings from "
"a previous researcher and must:\n\n"
"1. Identify key themes and patterns\n"
"2. Assess the reliability and significance of the findings\n"
"3. Provide actionable insights and recommendations\n"
"4. Use set_output with key='analysis' to save your analysis\n\n"
"Be concise but insightful. Focus on what matters most."
),
)
# -------------------------------------------------------------------------
# HTML page
# -------------------------------------------------------------------------
HTML_PAGE = ( # noqa: E501
"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>ContextHandoff Demo</title>
<style>
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: 'SF Mono', 'Fira Code', monospace;
background: #0d1117;
color: #c9d1d9;
height: 100vh;
display: flex;
flex-direction: column;
}
header {
background: #161b22;
padding: 12px 20px;
border-bottom: 1px solid #30363d;
display: flex;
align-items: center;
gap: 16px;
}
header h1 {
font-size: 16px;
color: #58a6ff;
font-weight: 600;
}
.badge {
font-size: 12px;
padding: 3px 10px;
border-radius: 12px;
background: #21262d;
color: #8b949e;
}
.badge.researcher {
background: #1a3a5c;
color: #58a6ff;
}
.badge.analyst {
background: #1a4b2e;
color: #3fb950;
}
.badge.handoff {
background: #3d1f00;
color: #d29922;
}
.badge.done {
background: #21262d;
color: #8b949e;
}
.badge.error {
background: #4b1a1a;
color: #f85149;
}
.chat {
flex: 1;
overflow-y: auto;
padding: 16px;
}
.msg {
margin: 8px 0;
padding: 10px 14px;
border-radius: 8px;
line-height: 1.6;
white-space: pre-wrap;
word-wrap: break-word;
}
.msg.user {
background: #1a3a5c;
color: #58a6ff;
}
.msg.assistant {
background: #161b22;
color: #c9d1d9;
}
.msg.assistant.analyst-msg {
border-left: 3px solid #3fb950;
}
.msg.event {
background: transparent;
color: #8b949e;
font-size: 11px;
padding: 4px 14px;
border-left: 3px solid #30363d;
}
.msg.event.loop {
border-left-color: #58a6ff;
}
.msg.event.tool {
border-left-color: #d29922;
}
.msg.event.stall {
border-left-color: #f85149;
}
.handoff-banner {
margin: 16px 0;
padding: 16px;
background: #1c1200;
border: 1px solid #d29922;
border-radius: 8px;
text-align: center;
}
.handoff-banner h3 {
color: #d29922;
font-size: 14px;
margin-bottom: 8px;
}
.handoff-banner p, .result-banner p {
color: #8b949e;
font-size: 12px;
line-height: 1.5;
max-height: 200px;
overflow-y: auto;
white-space: pre-wrap;
text-align: left;
}
.result-banner {
margin: 16px 0;
padding: 16px;
background: #0a2614;
border: 1px solid #3fb950;
border-radius: 8px;
}
.result-banner h3 {
color: #3fb950;
font-size: 14px;
margin-bottom: 8px;
text-align: center;
}
.result-banner .label {
color: #58a6ff;
font-size: 11px;
font-weight: 600;
margin-top: 10px;
margin-bottom: 2px;
}
.result-banner .tokens {
color: #484f58;
font-size: 11px;
text-align: center;
margin-top: 10px;
}
.input-bar {
padding: 12px 16px;
background: #161b22;
border-top: 1px solid #30363d;
display: flex;
gap: 8px;
}
.input-bar input {
flex: 1;
background: #0d1117;
border: 1px solid #30363d;
color: #c9d1d9;
padding: 8px 12px;
border-radius: 6px;
font-family: inherit;
font-size: 14px;
outline: none;
}
.input-bar input:focus {
border-color: #58a6ff;
}
.input-bar button {
background: #238636;
color: #fff;
border: none;
padding: 8px 20px;
border-radius: 6px;
cursor: pointer;
font-family: inherit;
font-weight: 600;
}
.input-bar button:hover {
background: #2ea043;
}
.input-bar button:disabled {
background: #21262d;
color: #484f58;
cursor: not-allowed;
}
</style>
</head>
<body>
<header>
<h1>ContextHandoff Demo</h1>
<span id="phase" class="badge">Idle</span>
<span id="iter" class="badge" style="display:none">Step 0</span>
</header>
<div id="chat" class="chat"></div>
<div class="input-bar">
<input id="input" type="text"
placeholder="Enter a research topic..." autofocus />
<button id="go" onclick="run()">Research</button>
</div>
<script>
let ws = null;
let currentAssistantEl = null;
let iterCount = 0;
let currentPhase = 'idle';
const chat = document.getElementById('chat');
const phase = document.getElementById('phase');
const iterEl = document.getElementById('iter');
const goBtn = document.getElementById('go');
const inputEl = document.getElementById('input');
inputEl.addEventListener('keydown', e => {
if (e.key === 'Enter') run();
});
function setPhase(text, cls) {
phase.textContent = text;
phase.className = 'badge ' + cls;
currentPhase = cls;
}
function addMsg(text, cls) {
const el = document.createElement('div');
el.className = 'msg ' + cls;
el.textContent = text;
chat.appendChild(el);
chat.scrollTop = chat.scrollHeight;
return el;
}
function addHandoffBanner(summary) {
const banner = document.createElement('div');
banner.className = 'handoff-banner';
const h3 = document.createElement('h3');
h3.textContent = 'Context Handoff: Researcher -> Analyst';
const p = document.createElement('p');
p.textContent = summary || 'Passing research context...';
banner.appendChild(h3);
banner.appendChild(p);
chat.appendChild(banner);
chat.scrollTop = chat.scrollHeight;
}
function addResultBanner(researcher, analyst, tokens) {
const banner = document.createElement('div');
banner.className = 'result-banner';
const h3 = document.createElement('h3');
h3.textContent = 'Pipeline Complete';
banner.appendChild(h3);
if (researcher && researcher.research_summary) {
const lbl = document.createElement('div');
lbl.className = 'label';
lbl.textContent = 'RESEARCH SUMMARY';
banner.appendChild(lbl);
const p = document.createElement('p');
p.textContent = researcher.research_summary;
banner.appendChild(p);
}
if (analyst && analyst.analysis) {
const lbl = document.createElement('div');
lbl.className = 'label';
lbl.textContent = 'ANALYSIS';
lbl.style.color = '#3fb950';
banner.appendChild(lbl);
const p = document.createElement('p');
p.textContent = analyst.analysis;
banner.appendChild(p);
}
if (tokens) {
const t = document.createElement('div');
t.className = 'tokens';
t.textContent = 'Total tokens: ' + tokens.toLocaleString();
banner.appendChild(t);
}
chat.appendChild(banner);
chat.scrollTop = chat.scrollHeight;
}
function connect() {
ws = new WebSocket('ws://' + location.host + '/ws');
ws.onopen = () => {
setPhase('Ready', 'done');
goBtn.disabled = false;
};
ws.onmessage = handleEvent;
ws.onerror = () => { setPhase('Error', 'error'); };
ws.onclose = () => {
setPhase('Reconnecting...', '');
goBtn.disabled = true;
setTimeout(connect, 2000);
};
}
function handleEvent(msg) {
const evt = JSON.parse(msg.data);
if (evt.type === 'phase') {
if (evt.phase === 'researcher') {
setPhase('Researcher', 'researcher');
} else if (evt.phase === 'handoff') {
setPhase('Handoff', 'handoff');
} else if (evt.phase === 'analyst') {
setPhase('Analyst', 'analyst');
}
iterCount = 0;
iterEl.style.display = 'none';
}
else if (evt.type === 'llm_text_delta') {
if (currentAssistantEl) {
currentAssistantEl.textContent += evt.content;
chat.scrollTop = chat.scrollHeight;
}
}
else if (evt.type === 'node_loop_iteration') {
iterCount = evt.iteration || (iterCount + 1);
iterEl.textContent = 'Step ' + iterCount;
iterEl.style.display = '';
}
else if (evt.type === 'tool_call_started') {
var info = evt.tool_name + '('
+ JSON.stringify(evt.tool_input).slice(0, 120) + ')';
addMsg('TOOL ' + info, 'event tool');
}
else if (evt.type === 'tool_call_completed') {
var preview = (evt.result || '').slice(0, 200);
var cls = evt.is_error ? 'stall' : 'tool';
addMsg(
'RESULT ' + evt.tool_name + ': ' + preview,
'event ' + cls
);
var assistCls = currentPhase === 'analyst'
? 'assistant analyst-msg' : 'assistant';
currentAssistantEl = addMsg('', assistCls);
}
else if (evt.type === 'handoff_context') {
addHandoffBanner(evt.summary);
var assistCls = 'assistant analyst-msg';
currentAssistantEl = addMsg('', assistCls);
}
else if (evt.type === 'node_result') {
if (evt.node_id === 'researcher') {
if (currentAssistantEl
&& !currentAssistantEl.textContent) {
currentAssistantEl.remove();
}
}
}
else if (evt.type === 'done') {
setPhase('Done', 'done');
iterEl.style.display = 'none';
if (currentAssistantEl
&& !currentAssistantEl.textContent) {
currentAssistantEl.remove();
}
currentAssistantEl = null;
addResultBanner(
evt.researcher, evt.analyst, evt.total_tokens
);
goBtn.disabled = false;
inputEl.placeholder = 'Enter another topic...';
}
else if (evt.type === 'error') {
setPhase('Error', 'error');
addMsg('ERROR ' + evt.message, 'event stall');
goBtn.disabled = false;
}
else if (evt.type === 'node_stalled') {
addMsg('STALLED ' + evt.reason, 'event stall');
}
}
function run() {
const text = inputEl.value.trim();
if (!text || !ws || ws.readyState !== 1) return;
chat.innerHTML = '';
addMsg(text, 'user');
currentAssistantEl = addMsg('', 'assistant');
inputEl.value = '';
goBtn.disabled = true;
ws.send(JSON.stringify({ topic: text }));
}
connect();
</script>
</body>
</html>"""
)
# -------------------------------------------------------------------------
# WebSocket handler — sequential Node A → Handoff → Node B
# -------------------------------------------------------------------------
async def handle_ws(websocket):
"""Run the two-node handoff pipeline per user message."""
try:
async for raw in websocket:
try:
msg = json.loads(raw)
except Exception:
continue
topic = msg.get("topic", "")
if not topic:
continue
logger.info(f"Starting handoff pipeline for: {topic}")
try:
await _run_pipeline(websocket, topic)
except websockets.exceptions.ConnectionClosed:
logger.info("WebSocket closed during pipeline")
return
except Exception as e:
logger.exception("Pipeline error")
try:
await websocket.send(json.dumps({"type": "error", "message": str(e)}))
except Exception:
pass
except websockets.exceptions.ConnectionClosed:
pass
async def _run_pipeline(websocket, topic: str):
"""Execute: Node A (research) → ContextHandoff → Node B (analysis)."""
import shutil
# Fresh stores for each run
run_dir = Path(tempfile.mkdtemp(prefix="hive_run_", dir=STORE_DIR))
store_a = FileConversationStore(run_dir / "node_a")
store_b = FileConversationStore(run_dir / "node_b")
# Shared event bus
bus = EventBus()
async def forward_event(event):
try:
payload = {"type": event.type.value, **event.data}
if event.node_id:
payload["node_id"] = event.node_id
await websocket.send(json.dumps(payload))
except Exception:
pass
bus.subscribe(
event_types=[
EventType.NODE_LOOP_STARTED,
EventType.NODE_LOOP_ITERATION,
EventType.NODE_LOOP_COMPLETED,
EventType.LLM_TEXT_DELTA,
EventType.TOOL_CALL_STARTED,
EventType.TOOL_CALL_COMPLETED,
EventType.NODE_STALLED,
],
handler=forward_event,
)
tools = list(TOOL_REGISTRY.get_tools().values())
tool_executor = TOOL_REGISTRY.get_executor()
# ---- Phase 1: Researcher ------------------------------------------------
await websocket.send(json.dumps({"type": "phase", "phase": "researcher"}))
node_a = EventLoopNode(
event_bus=bus,
judge=None, # implicit judge: accept when output_keys filled
config=LoopConfig(
max_iterations=20,
max_tool_calls_per_turn=30,
max_history_tokens=32_000,
),
conversation_store=store_a,
tool_executor=tool_executor,
)
ctx_a = NodeContext(
runtime=RUNTIME,
node_id="researcher",
node_spec=RESEARCHER_SPEC,
memory=SharedMemory(),
input_data={"topic": topic},
llm=LLM,
available_tools=tools,
)
result_a = await node_a.execute(ctx_a)
logger.info(
"Researcher done: success=%s, tokens=%s",
result_a.success,
result_a.tokens_used,
)
await websocket.send(
json.dumps(
{
"type": "node_result",
"node_id": "researcher",
"success": result_a.success,
"output": result_a.output,
}
)
)
if not result_a.success:
await websocket.send(
json.dumps(
{
"type": "error",
"message": f"Researcher failed: {result_a.error}",
}
)
)
return
# ---- Phase 2: Context Handoff -------------------------------------------
await websocket.send(json.dumps({"type": "phase", "phase": "handoff"}))
# Restore the researcher's conversation from store
conversation_a = await NodeConversation.restore(store_a)
if conversation_a is None:
await websocket.send(
json.dumps(
{
"type": "error",
"message": "Failed to restore researcher conversation",
}
)
)
return
handoff_engine = ContextHandoff(llm=LLM)
handoff_context = handoff_engine.summarize_conversation(
conversation=conversation_a,
node_id="researcher",
output_keys=["research_summary"],
)
formatted_handoff = ContextHandoff.format_as_input(handoff_context)
logger.info(
"Handoff: %d turns, ~%d tokens, keys=%s",
handoff_context.turn_count,
handoff_context.total_tokens_used,
list(handoff_context.key_outputs.keys()),
)
# Send handoff context to browser
await websocket.send(
json.dumps(
{
"type": "handoff_context",
"summary": handoff_context.summary[:500],
"turn_count": handoff_context.turn_count,
"tokens": handoff_context.total_tokens_used,
"key_outputs": handoff_context.key_outputs,
}
)
)
# ---- Phase 3: Analyst ---------------------------------------------------
await websocket.send(json.dumps({"type": "phase", "phase": "analyst"}))
node_b = EventLoopNode(
event_bus=bus,
judge=None, # implicit judge
config=LoopConfig(
max_iterations=10,
max_tool_calls_per_turn=30,
max_history_tokens=32_000,
),
conversation_store=store_b,
)
ctx_b = NodeContext(
runtime=RUNTIME,
node_id="analyst",
node_spec=ANALYST_SPEC,
memory=SharedMemory(),
input_data={"context": formatted_handoff},
llm=LLM,
available_tools=[],
)
result_b = await node_b.execute(ctx_b)
logger.info(
"Analyst done: success=%s, tokens=%s",
result_b.success,
result_b.tokens_used,
)
# ---- Done ---------------------------------------------------------------
await websocket.send(
json.dumps(
{
"type": "done",
"researcher": result_a.output,
"analyst": result_b.output,
"total_tokens": ((result_a.tokens_used or 0) + (result_b.tokens_used or 0)),
}
)
)
# Clean up temp stores
try:
shutil.rmtree(run_dir)
except Exception:
pass
# -------------------------------------------------------------------------
# HTTP handler
# -------------------------------------------------------------------------
async def process_request(connection, request: Request):
"""Serve HTML on GET /, upgrade to WebSocket on /ws."""
if request.path == "/ws":
return None
return Response(
HTTPStatus.OK,
"OK",
websockets.Headers({"Content-Type": "text/html; charset=utf-8"}),
HTML_PAGE.encode(),
)
# -------------------------------------------------------------------------
# Main
# -------------------------------------------------------------------------
async def main():
port = 8766
async with websockets.serve(
handle_ws,
"0.0.0.0",
port,
process_request=process_request,
):
logger.info(f"Handoff demo at http://localhost:{port}")
logger.info("Enter a research topic to start the pipeline.")
await asyncio.Future()
if __name__ == "__main__":
asyncio.run(main())
File diff suppressed because it is too large
-132
@@ -1,132 +0,0 @@
"""
Minimal Manual Agent Example
----------------------------
This example demonstrates how to build and run an agent programmatically
without using the Claude Code CLI or external LLM APIs.
It uses custom NodeProtocol implementations to define logic in pure Python,
making it perfect for understanding the core runtime loop:
Setup -> Graph definition -> Execution -> Result
Run with:
uv run python core/examples/manual_agent.py
"""
import asyncio
from framework.graph import EdgeCondition, EdgeSpec, Goal, GraphSpec, NodeSpec
from framework.graph.executor import GraphExecutor
from framework.graph.node import NodeContext, NodeProtocol, NodeResult
from framework.runtime.core import Runtime
# 1. Define Node Logic (Custom NodeProtocol implementations)
class GreeterNode(NodeProtocol):
"""Generate a simple greeting."""
async def execute(self, ctx: NodeContext) -> NodeResult:
name = ctx.input_data.get("name", "World")
greeting = f"Hello, {name}!"
ctx.memory.write("greeting", greeting)
return NodeResult(success=True, output={"greeting": greeting})
class UppercaserNode(NodeProtocol):
"""Convert text to uppercase."""
async def execute(self, ctx: NodeContext) -> NodeResult:
greeting = ctx.input_data.get("greeting") or ctx.memory.read("greeting") or ""
result = greeting.upper()
ctx.memory.write("final_greeting", result)
return NodeResult(success=True, output={"final_greeting": result})
async def main():
print("Setting up Manual Agent...")
# 2. Define the Goal
# Every agent needs a goal with success criteria
goal = Goal(
id="greet-user",
name="Greet User",
description="Generate a friendly uppercase greeting",
success_criteria=[
{
"id": "greeting_generated",
"description": "Greeting produced",
"metric": "custom",
"target": "any",
}
],
)
# 3. Define Nodes
# Nodes describe steps in the process
node1 = NodeSpec(
id="greeter",
name="Greeter",
description="Generates a simple greeting",
node_type="event_loop",
input_keys=["name"],
output_keys=["greeting"],
)
node2 = NodeSpec(
id="uppercaser",
name="Uppercaser",
description="Converts greeting to uppercase",
node_type="event_loop",
input_keys=["greeting"],
output_keys=["final_greeting"],
)
# 4. Define Edges
# Edges define the flow between nodes
edge1 = EdgeSpec(
id="greet-to-upper",
source="greeter",
target="uppercaser",
condition=EdgeCondition.ON_SUCCESS,
)
# 5. Create Graph
# The graph works like a blueprint connecting nodes and edges
graph = GraphSpec(
id="greeting-agent",
goal_id="greet-user",
entry_node="greeter",
terminal_nodes=["uppercaser"],
nodes=[node1, node2],
edges=[edge1],
)
# 6. Initialize Runtime & Executor
# Runtime handles state/memory; Executor runs the graph
from pathlib import Path
runtime = Runtime(storage_path=Path("./agent_logs"))
executor = GraphExecutor(runtime=runtime)
# 7. Register Node Implementations
# Connect node IDs in the graph to actual Python implementations
executor.register_node("greeter", GreeterNode())
executor.register_node("uppercaser", UppercaserNode())
# 8. Execute Agent
print("Executing agent with input: name='Alice'...")
result = await executor.execute(graph=graph, goal=goal, input_data={"name": "Alice"})
# 9. Verify Results
if result.success:
print("\nSuccess!")
print(f"Path taken: {' -> '.join(result.path)}")
print(f"Final output: {result.output.get('final_greeting')}")
else:
print(f"\nFailed: {result.error}")
if __name__ == "__main__":
# Optional: Enable logging to see internal decision flow
# logging.basicConfig(level=logging.INFO)
asyncio.run(main())
-194
@@ -1,194 +0,0 @@
#!/usr/bin/env python3
"""
Example: Integrating MCP Servers with the Core Framework
This example demonstrates how to:
1. Register MCP servers programmatically
2. Use MCP tools in agents
3. Load MCP servers from configuration files
"""
import asyncio
from pathlib import Path
from framework.runner.runner import AgentRunner
async def example_1_programmatic_registration():
"""Example 1: Register MCP server programmatically"""
print("\n=== Example 1: Programmatic MCP Server Registration ===\n")
# Load an existing agent
runner = AgentRunner.load("exports/task-planner")
# Register tools MCP server via STDIO
num_tools = runner.register_mcp_server(
name="tools",
transport="stdio",
command="python",
args=["-m", "aden_tools.mcp_server", "--stdio"],
cwd="../tools",
)
print(f"Registered {num_tools} tools from tools MCP server")
# List all available tools
tools = runner._tool_registry.get_tools()
print(f"\nAvailable tools: {list(tools.keys())}")
# Run the agent with MCP tools available
result = await runner.run(
{"objective": "Search for 'Claude AI' and summarize the top 3 results"}
)
print(f"\nAgent result: {result}")
# Cleanup
runner.cleanup()
async def example_2_http_transport():
"""Example 2: Connect to MCP server via HTTP"""
print("\n=== Example 2: HTTP MCP Server Connection ===\n")
# First, start the tools MCP server in HTTP mode:
# cd tools && python mcp_server.py --port 4001
runner = AgentRunner.load("exports/task-planner")
# Register tools via HTTP
num_tools = runner.register_mcp_server(
name="tools-http",
transport="http",
url="http://localhost:4001",
)
print(f"Registered {num_tools} tools from HTTP MCP server")
# Cleanup
runner.cleanup()
async def example_3_config_file():
"""Example 3: Load MCP servers from configuration file"""
print("\n=== Example 3: Load from Configuration File ===\n")
# Create a test agent folder with mcp_servers.json
test_agent_path = Path("exports/task-planner")
# Copy example config (in practice, you'd place this in your agent folder)
import shutil
shutil.copy("examples/mcp_servers.json", test_agent_path / "mcp_servers.json")
# Load agent - MCP servers will be auto-discovered
runner = AgentRunner.load(test_agent_path)
# Tools are automatically available
tools = runner._tool_registry.get_tools()
print(f"Available tools: {list(tools.keys())}")
# Cleanup
runner.cleanup()
# Clean up the test config
(test_agent_path / "mcp_servers.json").unlink()
async def example_4_custom_agent_with_mcp_tools():
"""Example 4: Build custom agent that uses MCP tools"""
print("\n=== Example 4: Custom Agent with MCP Tools ===\n")
from framework.builder.workflow import GraphBuilder
# Create a workflow builder
builder = GraphBuilder()
# Define goal
builder.set_goal(
goal_id="web-researcher",
name="Web Research Agent",
description="Search the web and summarize findings",
)
# Add success criteria
builder.add_success_criterion(
"search-results", "Successfully retrieve at least 3 web search results"
)
builder.add_success_criterion("summary", "Provide a clear, concise summary of the findings")
# Add nodes that will use MCP tools
builder.add_node(
node_id="web-searcher",
name="Web Search",
description="Search the web for information",
node_type="event_loop",
system_prompt="Search for {query} and return the top results. Use the web_search tool.",
tools=["web_search"], # This tool comes from tools MCP server
input_keys=["query"],
output_keys=["search_results"],
)
builder.add_node(
node_id="summarizer",
name="Summarize Results",
description="Summarize the search results",
node_type="event_loop",
system_prompt="Summarize the following search results in 2-3 sentences: {search_results}",
input_keys=["search_results"],
output_keys=["summary"],
)
# Connect nodes
builder.add_edge("web-searcher", "summarizer")
# Set entry point
builder.set_entry("web-searcher")
builder.set_terminal("summarizer")
# Export the agent
export_path = Path("exports/web-research-agent")
export_path.mkdir(parents=True, exist_ok=True)
builder.export(export_path)
# Load and register MCP server
runner = AgentRunner.load(export_path)
runner.register_mcp_server(
name="tools",
transport="stdio",
command="python",
args=["-m", "aden_tools.mcp_server", "--stdio"],
cwd="../tools",
)
# Run the agent
result = await runner.run({"query": "latest AI breakthroughs 2026"})
print(f"\nAgent completed with result:\n{result}")
# Cleanup
runner.cleanup()
async def main():
"""Run all examples"""
print("=" * 60)
print("MCP Integration Examples")
print("=" * 60)
try:
# Run examples
await example_1_programmatic_registration()
# await example_2_http_transport() # Requires HTTP server running
# await example_3_config_file()
# await example_4_custom_agent_with_mcp_tools()
except Exception as e:
print(f"\nError running example: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(main())
+14 -64
@@ -1,70 +1,20 @@
"""
Aden Hive Framework: A goal-driven agent runtime optimized for Builder observability.
"""Hive Agent Framework.
The runtime is designed around DECISIONS, not just actions. Every significant
choice the agent makes is captured with:
- What it was trying to do (intent)
- What options it considered
- What it chose and why
- What happened as a result
- Whether that was good or bad (evaluated post-hoc)
This gives the Builder LLM the information it needs to improve agent behavior.
## Testing Framework
The framework includes a Goal-Based Testing system (Goal Agent Eval):
- Generate tests from Goal success_criteria and constraints
- Mandatory user approval before tests are stored
- Parallel test execution with error categorization
- Debug tools with fix suggestions
See `framework.testing` for details.
Core classes:
ColonyRuntime -- orchestrates parallel worker clones in a colony
AgentLoop -- the LLM + tool execution loop (one per worker)
AgentLoader -- loads agent config from disk, builds pipeline
DecisionTracker -- records decisions for post-hoc analysis
"""
from framework.builder.query import BuilderQuery
from framework.llm import AnthropicProvider, LLMProvider
from framework.runner import AgentOrchestrator, AgentRunner
from framework.runtime.core import Runtime
from framework.schemas.decision import Decision, DecisionEvaluation, Option, Outcome
from framework.schemas.run import Problem, Run, RunSummary
# Testing framework
from framework.testing import (
ApprovalStatus,
DebugTool,
ErrorCategory,
Test,
TestResult,
TestStorage,
TestSuiteResult,
)
from framework.agent_loop import AgentLoop
from framework.host import ColonyRuntime
from framework.loader import AgentLoader
from framework.tracker import DecisionTracker
__all__ = [
# Schemas
"Decision",
"Option",
"Outcome",
"DecisionEvaluation",
"Run",
"RunSummary",
"Problem",
# Runtime
"Runtime",
# Builder
"BuilderQuery",
# LLM
"LLMProvider",
"AnthropicProvider",
# Runner
"AgentRunner",
"AgentOrchestrator",
# Testing
"Test",
"TestResult",
"TestSuiteResult",
"TestStorage",
"ApprovalStatus",
"ErrorCategory",
"DebugTool",
"ColonyRuntime",
"AgentLoader",
"AgentLoop",
"DecisionTracker",
]
+34
@@ -0,0 +1,34 @@
"""Agent loop -- the core agent execution primitive."""
from framework.agent_loop.conversation import ( # noqa: F401
ConversationStore,
Message,
NodeConversation,
)
from framework.agent_loop.types import ( # noqa: F401
AgentContext,
AgentProtocol,
AgentResult,
AgentSpec,
)
def __getattr__(name: str):
if name in ("AgentLoop", "JudgeProtocol", "JudgeVerdict", "LoopConfig", "OutputAccumulator"):
from framework.agent_loop.agent_loop import (
AgentLoop,
JudgeProtocol,
JudgeVerdict,
LoopConfig,
OutputAccumulator,
)
_exports = {
"AgentLoop": AgentLoop,
"JudgeProtocol": JudgeProtocol,
"JudgeVerdict": JudgeVerdict,
"LoopConfig": LoopConfig,
"OutputAccumulator": OutputAccumulator,
}
return _exports[name]
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
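# Usage note (not part of the module's public surface): the PEP 562
# module-level __getattr__ above keeps ``import framework.agent_loop`` cheap.
# The heavy loop classes are imported only on first attribute access, e.g.
#
#   from framework.agent_loop import LoopConfig  # resolved via __getattr__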
File diff suppressed because it is too large
@@ -3,11 +3,20 @@
from __future__ import annotations
import json
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Literal, Protocol, runtime_checkable
LEGACY_RUN_ID = "__legacy_run__"
logger = logging.getLogger(__name__)
def is_legacy_run_id(run_id: str | None) -> bool:
"""True when run_id represents pre-migration (no run boundary) data."""
return run_id is None or run_id == LEGACY_RUN_ID
@dataclass
class Message:
@@ -33,20 +42,44 @@ class Message:
is_transition_marker: bool = False
# True when this message is real human input (from /chat), not a system prompt
is_client_input: bool = False
# Optional image content blocks (e.g. from browser_screenshot)
image_content: list[dict[str, Any]] | None = None
# True when message contains an activated skill body (AS-10: never prune)
is_skill_content: bool = False
# Logical worker run identifier for shared-session persistence
run_id: str | None = None
def to_llm_dict(self) -> dict[str, Any]:
"""Convert to OpenAI-format message dict."""
if self.role == "user":
if self.image_content:
blocks: list[dict[str, Any]] = []
if self.content:
blocks.append({"type": "text", "text": self.content})
blocks.extend(self.image_content)
return {"role": "user", "content": blocks}
return {"role": "user", "content": self.content}
if self.role == "assistant":
d: dict[str, Any] = {"role": "assistant", "content": self.content}
d: dict[str, Any] = {"role": "assistant"}
if self.tool_calls:
d["tool_calls"] = self.tool_calls
d["content"] = self.content if self.content else None
else:
d["content"] = self.content or ""
return d
# role == "tool"
content = f"ERROR: {self.content}" if self.is_error else self.content
if self.image_content:
# Multimodal tool result: text + image content blocks
blocks: list[dict[str, Any]] = [{"type": "text", "text": content}]
blocks.extend(self.image_content)
return {
"role": "tool",
"tool_call_id": self.tool_use_id,
"content": blocks,
}
return {
"role": "tool",
"tool_call_id": self.tool_use_id,
@@ -72,6 +105,10 @@ class Message:
d["is_transition_marker"] = self.is_transition_marker
if self.is_client_input:
d["is_client_input"] = self.is_client_input
if self.image_content is not None:
d["image_content"] = self.image_content
if self.run_id is not None:
d["run_id"] = self.run_id
return d
@classmethod
@@ -87,16 +124,55 @@ class Message:
phase_id=data.get("phase_id"),
is_transition_marker=data.get("is_transition_marker", False),
is_client_input=data.get("is_client_input", False),
image_content=data.get("image_content"),
run_id=data.get("run_id"),
)
def _normalize_cursor(cursor: dict[str, Any] | None) -> dict[str, Any]:
"""Normalize legacy and run-scoped cursor formats into one flat shape."""
return dict(cursor) if cursor else {}
def get_cursor_next_seq(cursor: dict[str, Any] | None) -> int | None:
next_seq = (cursor or {}).get("next_seq")
return next_seq if isinstance(next_seq, int) else None
def update_cursor_next_seq(cursor: dict[str, Any] | None, next_seq: int) -> dict[str, Any]:
updated = dict(cursor or {})
updated["next_seq"] = next_seq
return updated
def get_run_cursor(cursor: dict[str, Any] | None, run_id: str | None) -> dict[str, Any] | None:
return dict(cursor) if cursor else None
def update_run_cursor(
cursor: dict[str, Any] | None,
run_id: str | None,
values: dict[str, Any],
) -> dict[str, Any]:
updated = dict(cursor or {})
updated.update(values)
return updated
def _extract_spillover_filename(content: str) -> str | None:
"""Extract spillover filename from a tool result annotation.
Matches patterns produced by EventLoopNode._truncate_tool_result():
- Large result: "saved to 'web_search_1.txt'"
- Small result: "[Saved to 'web_search_1.txt']"
Matches patterns produced by ``truncate_tool_result``:
- New large-result header: "Full result saved at: /abs/path/file.txt"
- Legacy bracketed trailer: "[Saved to 'file.txt']" (pre-2026-04-15,
retained here so cold conversations still resolve)
"""
# New prose format — ``saved at: <absolute path>``, terminated by
# whitespace or end-of-string.
match = re.search(r"[Ss]aved at:\s*(\S+)", content)
if match:
return match.group(1)
# Legacy format.
match = re.search(r"[Ss]aved to '([^']+)'", content)
return match.group(1) if match else None
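# Illustrative inputs/outputs (hypothetical filenames):
#   _extract_spillover_filename("Full result saved at: /tmp/web_search_1.txt")
#       -> "/tmp/web_search_1.txt"
#   _extract_spillover_filename("[Saved to 'web_search_1.txt']")
#       -> "web_search_1.txt"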
@@ -169,8 +245,8 @@ def extract_tool_call_history(messages: list[Message], max_entries: int = 30) ->
return args.get("query", "")
if name == "web_scrape":
return args.get("url", "")
if name in ("load_data", "save_data"):
return args.get("filename", "")
if name == "read_file":
return args.get("path", "")
return ""
for msg in messages:
@@ -186,8 +262,8 @@ def extract_tool_call_history(messages: list[Message], max_entries: int = 30) ->
summary = _summarize_input(name, args)
tool_calls_detail.setdefault(name, []).append(summary)
if name == "save_data" and args.get("filename"):
files_saved.append(args["filename"])
if name == "read_file" and args.get("path"):
files_saved.append(args["path"])
if name == "set_output" and args.get("key"):
outputs_set.append(args["key"])
@@ -239,7 +315,7 @@ class ConversationStore(Protocol):
async def read_cursor(self) -> dict[str, Any] | None: ...
async def delete_parts_before(self, seq: int) -> None: ...
async def delete_parts_before(self, seq: int, run_id: str | None = None) -> None: ...
async def close(self) -> None: ...
@@ -260,7 +336,7 @@ def _try_extract_key(content: str, key: str) -> str | None:
3. Colon format: ``key: value``.
4. Equals format: ``key = value``.
"""
from framework.graph.node import find_json_object
from framework.orchestrator.node import find_json_object
# 1. Whole message is JSON
try:
@@ -307,14 +383,25 @@ class NodeConversation:
def __init__(
self,
system_prompt: str = "",
max_history_tokens: int = 32000,
max_context_tokens: int = 32000,
compaction_threshold: float = 0.8,
output_keys: list[str] | None = None,
store: ConversationStore | None = None,
run_id: str | None = None,
compaction_buffer_tokens: int | None = None,
compaction_warning_buffer_tokens: int | None = None,
) -> None:
self._system_prompt = system_prompt
self._max_history_tokens = max_history_tokens
self._max_context_tokens = max_context_tokens
self._compaction_threshold = compaction_threshold
# Buffer-based compaction trigger (Gap 7). When set, takes
# precedence over the multiplicative compaction_threshold so the
# loop reserves a fixed headroom for the next turn's input+output
# instead of trying to get exactly X% of the way to the hard
# limit. If left as None the legacy threshold-based rule is
# used, keeping old call sites behaving identically.
self._compaction_buffer_tokens = compaction_buffer_tokens
self._compaction_warning_buffer_tokens = compaction_warning_buffer_tokens
self._output_keys = output_keys
self._store = store
self._messages: list[Message] = []
@@ -322,6 +409,7 @@ class NodeConversation:
self._meta_persisted: bool = False
self._last_api_input_tokens: int | None = None
self._current_phase: str | None = None
self._run_id: str | None = run_id
# --- Properties --------------------------------------------------------
@@ -373,17 +461,23 @@ class NodeConversation:
*,
is_transition_marker: bool = False,
is_client_input: bool = False,
image_content: list[dict[str, Any]] | None = None,
) -> Message:
msg = Message(
seq=self._next_seq,
role="user",
content=content,
phase_id=self._current_phase,
run_id=self._run_id,
is_transition_marker=is_transition_marker,
is_client_input=is_client_input,
image_content=image_content,
)
self._messages.append(msg)
self._next_seq += 1
# Invalidate stale API token count so estimate_tokens() uses
# the char-based heuristic which reflects the new message.
self._last_api_input_tokens = None
await self._persist(msg)
return msg
@@ -398,9 +492,11 @@ class NodeConversation:
content=content,
tool_calls=tool_calls,
phase_id=self._current_phase,
run_id=self._run_id,
)
self._messages.append(msg)
self._next_seq += 1
self._last_api_input_tokens = None
await self._persist(msg)
return msg
@@ -409,7 +505,30 @@ class NodeConversation:
tool_use_id: str,
content: str,
is_error: bool = False,
image_content: list[dict[str, Any]] | None = None,
is_skill_content: bool = False,
) -> Message:
# Dedup guard: reject a second tool_result for the same tool_use_id.
# Anthropic's API only accepts one result per tool_call, and a duplicate
# causes a hard 400 two turns later ("messages with role 'tool' must
# be a response to a preceding message with 'tool_calls'"). Duplicates
# can arise when a tool_call_timeout fires and records a placeholder
# error, then the real executor thread eventually delivers the actual
# result (the thread kept running inside run_in_executor — see
# tool_result_handler.execute_tool). We keep the FIRST result to
# preserve whatever state the agent already reasoned about.
for existing in reversed(self._messages):
if existing.role == "tool" and existing.tool_use_id == tool_use_id:
import logging as _logging
_logging.getLogger(__name__).warning(
"add_tool_result: dropping duplicate result for tool_use_id=%s "
"(first result preserved, %d chars; new result ignored, %d chars)",
tool_use_id,
len(existing.content),
len(content),
)
return existing
msg = Message(
seq=self._next_seq,
role="tool",
@@ -417,9 +536,13 @@ class NodeConversation:
tool_use_id=tool_use_id,
is_error=is_error,
phase_id=self._current_phase,
image_content=image_content,
is_skill_content=is_skill_content,
run_id=self._run_id,
)
self._messages.append(msg)
self._next_seq += 1
self._last_api_input_tokens = None
await self._persist(msg)
return msg
@@ -433,7 +556,48 @@ class NodeConversation:
can happen when a loop is cancelled mid-tool-execution.
"""
msgs = [m.to_llm_dict() for m in self._messages]
return self._repair_orphaned_tool_calls(msgs)
msgs = self._repair_orphaned_tool_calls(msgs)
msgs = self._sanitize_for_api(msgs)
return msgs
@staticmethod
def _sanitize_for_api(msgs: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Final pass: ensure message sequence is valid for strict APIs.
Rules:
1. No two consecutive messages with the same role (merge or drop)
2. Tool messages must have a tool_call_id
3. Assistant messages with tool_calls must have content=null, not ""
4. First message must not be 'tool' or 'assistant' (without prior context)
"""
cleaned: list[dict[str, Any]] = []
for m in msgs:
role = m.get("role")
# Fix assistant content when tool_calls present
if role == "assistant" and m.get("tool_calls"):
if m.get("content") == "":
m["content"] = None
# Drop tool messages without tool_call_id
if role == "tool" and not m.get("tool_call_id"):
continue
# Drop consecutive duplicate roles (merge user messages)
if cleaned and cleaned[-1].get("role") == role == "user":
prev_content = cleaned[-1].get("content", "")
curr_content = m.get("content", "")
if isinstance(prev_content, str) and isinstance(curr_content, str):
cleaned[-1]["content"] = f"{prev_content}\n{curr_content}"
continue
cleaned.append(m)
# Drop leading assistant/tool messages (no prior context)
while cleaned and cleaned[0].get("role") in ("assistant", "tool"):
cleaned.pop(0)
return cleaned
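# Sketch of the sanitizer's effect on a hypothetical minimal input:
#   [{"role": "user", "content": "a"}, {"role": "user", "content": "b"},
#    {"role": "tool", "content": "x"}]
# becomes
#   [{"role": "user", "content": "a\nb"}]
# (consecutive user messages merged, id-less tool message dropped).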
@staticmethod
def _repair_orphaned_tool_calls(
@@ -441,11 +605,18 @@ class NodeConversation:
) -> list[dict[str, Any]]:
"""Ensure tool_call / tool_result pairs are consistent.
1. **Orphaned tool results** (tool_result with no preceding tool_use)
are dropped. This happens when compaction removes an assistant
message but leaves its tool-result messages behind.
2. **Orphaned tool calls** (tool_use with no following tool_result)
get a synthetic error result appended. This happens when a loop
1. **Orphaned tool results** (tool_result with no matching tool_use
anywhere) are dropped. Happens after compaction removes the
parent assistant message.
2. **Positionally orphaned tool results** (tool_result separated
from its parent by a non-tool message, e.g. a user injection)
are dropped. The Anthropic API requires tool messages to
follow immediately after the assistant message that issued
the matching tool_call.
3. **Duplicate tool results** (same tool_call_id appearing more
than once) are dropped; only the first is kept.
4. **Orphaned tool calls** (tool_use with no following tool_result)
get a synthetic error result appended. Happens when the loop
is cancelled mid-tool-execution.
"""
# Pass 1: collect all tool_call IDs from assistant messages so we
@@ -458,41 +629,75 @@ class NodeConversation:
if tc_id:
all_tool_call_ids.add(tc_id)
# Pass 2: build repaired list — drop orphaned tool results, patch
# missing tool results.
# Pass 2: build repaired list — drop orphaned tool results, drop
# positional orphans and duplicates, patch missing tool results.
#
# ``open_tool_calls`` holds the tool_call IDs we're still expecting
# results for: it's populated when we emit an assistant-with-tool_calls
# and drained as matching tool messages follow. Any tool message
# whose id is not currently open is positionally invalid and gets
# dropped — that closes the gap that caused the tool-after-user
# 400 errors.
repaired: list[dict[str, Any]] = []
for i, m in enumerate(msgs):
# Drop tool-result messages whose tool_call_id has no matching
# tool_use in any assistant message (orphaned by compaction).
if m.get("role") == "tool":
tid = m.get("tool_call_id")
if tid and tid not in all_tool_call_ids:
continue # skip orphaned result
open_tool_calls: set[str] = set()
seen_tool_ids: set[str] = set()
for m in msgs:
role = m.get("role")
repaired.append(m)
tool_calls = m.get("tool_calls")
if m.get("role") != "assistant" or not tool_calls:
if role == "tool":
tid = m.get("tool_call_id")
# Drop tool results with no matching tool_use anywhere.
if not tid or tid not in all_tool_call_ids:
continue
# Drop duplicates (same id appearing twice) — keep first.
if tid in seen_tool_ids:
continue
# Drop positional orphans — tool messages whose parent
# assistant isn't the still-open assistant block.
if tid not in open_tool_calls:
continue
open_tool_calls.discard(tid)
seen_tool_ids.add(tid)
repaired.append(m)
continue
# Collect IDs of tool results that follow this assistant message
answered: set[str] = set()
for j in range(i + 1, len(msgs)):
if msgs[j].get("role") == "tool":
tid = msgs[j].get("tool_call_id")
if tid:
answered.add(tid)
else:
break # stop at first non-tool message
# Patch any missing results
for tc in tool_calls:
tc_id = tc.get("id")
if tc_id and tc_id not in answered:
# Any non-tool message closes the current assistant tool block.
# If the previous assistant left tool_calls unanswered, patch
# synthetic error results before emitting this message so the
# API sees a complete pairing.
if open_tool_calls:
for stale_id in list(open_tool_calls):
repaired.append(
{
"role": "tool",
"tool_call_id": tc_id,
"tool_call_id": stale_id,
"content": "ERROR: Tool execution was interrupted.",
}
)
seen_tool_ids.add(stale_id)
open_tool_calls.clear()
repaired.append(m)
if role == "assistant":
for tc in m.get("tool_calls") or []:
tc_id = tc.get("id")
if tc_id and tc_id not in seen_tool_ids:
open_tool_calls.add(tc_id)
# Tail: if the conversation ends with an assistant that issued
# tool_calls and no results followed, patch them so the next
# turn's first message can be a valid assistant/user response.
if open_tool_calls:
for stale_id in list(open_tool_calls):
repaired.append(
{
"role": "tool",
"tool_call_id": stale_id,
"content": "ERROR: Tool execution was interrupted.",
}
)
return repaired
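# Hypothetical example of the repair pass: an assistant issues tool_call
# id="a", a user message is injected before the result arrives, then the
# real result shows up late:
#   [assistant(tool_calls=[a]), user(...), tool(tool_call_id="a")]
# The pass appends a synthetic "ERROR: Tool execution was interrupted."
# result for "a" ahead of the user message and drops the late tool message,
# so the assistant/tool pairing stays valid for strict APIs.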
def estimate_tokens(self) -> int:
@@ -500,12 +705,15 @@ class NodeConversation:
Uses actual API input token count when available (set via
:meth:`update_token_count`), otherwise falls back to a
``total_chars / 4`` heuristic that includes both message content
AND tool_call argument sizes.
character-based heuristic that includes message content, tool_call
arguments, and image blocks. The heuristic applies a 4/3 safety
margin to avoid under-counting (inspired by Claude Code's compact
service).
"""
if self._last_api_input_tokens is not None:
return self._last_api_input_tokens
total_chars = 0
image_tokens = 0
for m in self._messages:
total_chars += len(m.content)
if m.tool_calls:
@@ -513,7 +721,11 @@ class NodeConversation:
func = tc.get("function", {})
total_chars += len(func.get("arguments", ""))
total_chars += len(func.get("name", ""))
return total_chars // 4
if m.image_content:
# Images/documents have a fixed token cost per block
image_tokens += len(m.image_content) * 2000
# Apply 4/3 safety margin to character-based estimate
return (total_chars * 4) // (3 * 4) + image_tokens
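# Worked example of the heuristic (no cached API count, hypothetical sizes):
# 12,000 chars of text plus one image block gives
#   (12_000 * 4) // (3 * 4) + 2_000 = 4_000 + 2_000 = 6_000 tokens,
# versus 3_000 + 2_000 = 5_000 with the plain chars/4 estimate.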
def update_token_count(self, actual_input_tokens: int) -> None:
"""Store actual API input token count for more accurate compaction.
@@ -525,16 +737,45 @@ class NodeConversation:
self._last_api_input_tokens = actual_input_tokens
def usage_ratio(self) -> float:
"""Current token usage as a fraction of *max_history_tokens*.
"""Current token usage as a fraction of *max_context_tokens*.
Returns 0.0 when ``max_history_tokens`` is zero (unlimited).
Returns 0.0 when ``max_context_tokens`` is zero (unlimited).
"""
if self._max_history_tokens <= 0:
if self._max_context_tokens <= 0:
return 0.0
return self.estimate_tokens() / self._max_history_tokens
return self.estimate_tokens() / self._max_context_tokens
def needs_compaction(self) -> bool:
return self.estimate_tokens() >= self._max_history_tokens * self._compaction_threshold
"""True when the conversation should be compacted before the
next LLM call.
Buffer-based rule (Gap 7): trigger when the current estimate
plus the configured buffer would exceed the hard context limit.
Prevents compaction from firing only AFTER we're already over
the limit and forced into a reactive binary-split pass.
When no buffer is configured, falls back to the multiplicative
threshold the old callers were built around.
"""
if self._max_context_tokens <= 0:
return False
if self._compaction_buffer_tokens is not None:
budget = self._max_context_tokens - self._compaction_buffer_tokens
return self.estimate_tokens() >= max(0, budget)
return self.estimate_tokens() >= self._max_context_tokens * self._compaction_threshold
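# Worked example with hypothetical numbers: max_context_tokens=200_000 and
# compaction_buffer_tokens=40_000 trigger compaction once the estimate hits
# 160_000. With no buffer and compaction_threshold=0.8 the trigger point is
# the same 160_000, but the buffer form preserves a fixed absolute headroom
# if the context limit is later raised.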
def compaction_warning(self) -> bool:
"""True when the conversation has crossed the warning threshold
but not yet the hard compaction trigger.
Used by telemetry / UI to show a "context getting tight" hint
before a compaction pass actually runs. Returns False when no
warning buffer is configured (legacy behaviour).
"""
if self._max_context_tokens <= 0 or self._compaction_warning_buffer_tokens is None:
return False
warn_at = self._max_context_tokens - self._compaction_warning_buffer_tokens
return self.estimate_tokens() >= max(0, warn_at)
# --- Output-key extraction ---------------------------------------------
@@ -610,8 +851,15 @@ class NodeConversation:
continue
if msg.is_error:
continue # never prune errors
if msg.is_skill_content:
continue # never prune activated skill instructions (AS-10)
if msg.content.startswith("[Pruned tool result"):
continue # already pruned
# Tiny results (set_output acks, confirmations) — pruning
# saves negligible space but makes the LLM think the call
# failed, causing costly retries.
if len(msg.content) < 100:
continue
# Phase-aware: protect current phase messages
if self._current_phase and msg.phase_id == self._current_phase:
@@ -637,12 +885,14 @@ class NodeConversation:
if spillover:
placeholder = (
f"[Pruned tool result: {orig_len} chars. "
f"Full data in '{spillover}'. "
f"Use load_data('{spillover}') to retrieve.]"
f"Pruned tool result ({orig_len:,} chars) cleared from context. "
f"Full data saved at: {spillover}\n"
f"Read the complete data with read_file(path='{spillover}')."
)
else:
placeholder = f"[Pruned tool result: {orig_len} chars cleared from context.]"
placeholder = (
f"Pruned tool result ({orig_len:,} chars) cleared from context."
)
self._messages[i] = Message(
seq=msg.seq,
@@ -653,6 +903,7 @@ class NodeConversation:
is_error=msg.is_error,
phase_id=msg.phase_id,
is_transition_marker=msg.is_transition_marker,
run_id=msg.run_id,
)
count += 1
@@ -663,6 +914,81 @@ class NodeConversation:
self._last_api_input_tokens = None
return count
async def evict_old_images(self, keep_latest: int = 2) -> int:
"""Strip ``image_content`` from older messages, keeping the most recent.
Screenshots from ``browser_screenshot`` are inlined into the
message's ``image_content`` as base64 data URLs. Each screenshot
costs ~250k tokens when the provider counts the base64 as
text four screenshots push a conversation over gemini's 1M
context limit and trigger out-of-context garbage output (see
``session_20260415_104727_5c4ed7ff`` for the terminal case
where the model emitted ``协日`` as its final text then stopped).
This method walks backward through messages and keeps
``image_content`` intact on the most recent ``keep_latest``
messages that have images. Older messages get their
``image_content`` nulled out; the text content (metadata
like url, dimensions, scale hints) stays, but the raw bytes
are dropped. Storage is updated too so cold-restore sees the
same evicted state.
Run this right after every tool result is recorded so image
context stays bounded even within a single iteration (the
compaction pipeline only fires at iteration boundaries, too
late for a single turn that takes 4 screenshots).
Returns the number of messages whose image_content was evicted.
"""
if not self._messages or keep_latest < 0:
return 0
# Find messages carrying images, walking newest → oldest.
image_indices: list[int] = []
for i in range(len(self._messages) - 1, -1, -1):
if self._messages[i].image_content:
image_indices.append(i)
# Nothing to evict if we have ≤ keep_latest images total.
if len(image_indices) <= keep_latest:
return 0
# Evict everything past the first keep_latest (newest) entries.
to_evict = image_indices[keep_latest:]
evicted = 0
for idx in to_evict:
msg = self._messages[idx]
self._messages[idx] = Message(
seq=msg.seq,
role=msg.role,
content=msg.content,
tool_use_id=msg.tool_use_id,
tool_calls=msg.tool_calls,
is_error=msg.is_error,
phase_id=msg.phase_id,
is_transition_marker=msg.is_transition_marker,
is_client_input=msg.is_client_input,
image_content=None, # ← dropped
is_skill_content=msg.is_skill_content,
run_id=msg.run_id,
)
evicted += 1
if self._store:
await self._store.write_part(
msg.seq, self._messages[idx].to_storage_dict()
)
if evicted:
# Reset token estimate — image blocks no longer contribute.
self._last_api_input_tokens = None
logger.info(
"evict_old_images: dropped image_content from %d message(s), "
"kept %d most recent",
evicted,
keep_latest,
)
return evicted
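# Illustrative case: with keep_latest=2 and five messages carrying
# screenshots, the three oldest get image_content=None (text metadata kept,
# raw bytes dropped) and the two newest keep their images; each evicted
# message is rewritten to the store so a cold restore sees the same state.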
async def compact(
self,
summary: str,
@@ -729,14 +1055,14 @@ class NodeConversation:
summary_seq = self._next_seq
self._next_seq += 1
summary_msg = Message(seq=summary_seq, role="user", content=summary)
summary_msg = Message(seq=summary_seq, role="user", content=summary, run_id=self._run_id)
# Persist
if self._store:
delete_before = recent_messages[0].seq if recent_messages else self._next_seq
await self._store.delete_parts_before(delete_before)
await self._store.write_part(summary_msg.seq, summary_msg.to_storage_dict())
await self._store.write_cursor({"next_seq": self._next_seq})
await self._write_next_seq()
self._messages = [summary_msg] + recent_messages
self._last_api_input_tokens = None # reset; next LLM call will recalibrate
@@ -794,6 +1120,15 @@ class NodeConversation:
freeform_lines: list[str] = []
collapsed_msgs: list[Message] = []
# Collect all tool_use IDs present in old messages so we can detect
# orphaned tool results whose parent assistant message was already
# compacted away (API invariant protection).
old_tc_ids: set[str] = set()
for msg in old_messages:
if msg.tool_calls:
for tc in msg.tool_calls:
old_tc_ids.add(tc.get("id", ""))
if aggressive:
# Aggressive: only keep set_output tool pairs and error results.
# Everything else is collapsed into a tool-call history summary.
@@ -815,9 +1150,17 @@ class NodeConversation:
else:
collapsible_tc_ids |= tc_ids
# Skill content and transition markers are always protected
for msg in old_messages:
if msg.role == "tool" and msg.is_skill_content and msg.tool_use_id:
protected_tc_ids.add(msg.tool_use_id)
# Second pass: classify all messages
for msg in old_messages:
if msg.role == "tool":
if msg.is_transition_marker:
# Transition markers are always kept (phase boundaries)
kept_structural.append(msg)
elif msg.role == "tool":
tc_id = msg.tool_use_id or ""
if tc_id in protected_tc_ids:
kept_structural.append(msg)
@@ -826,6 +1169,12 @@ class NodeConversation:
kept_structural.append(msg)
# Protect the parent assistant message too
protected_tc_ids.add(tc_id)
elif msg.is_skill_content:
kept_structural.append(msg)
elif tc_id and tc_id not in old_tc_ids:
# Orphaned tool result — parent tool_use not in old msgs.
# Keep it to maintain API invariants.
kept_structural.append(msg)
else:
collapsed_msgs.append(msg)
elif msg.role == "assistant" and msg.tool_calls:
@@ -842,6 +1191,7 @@ class NodeConversation:
is_error=msg.is_error,
phase_id=msg.phase_id,
is_transition_marker=msg.is_transition_marker,
run_id=msg.run_id,
)
)
else:
@@ -856,7 +1206,10 @@ class NodeConversation:
else:
# Standard mode: keep all tool call pairs as structural
for msg in old_messages:
if msg.role == "tool":
if msg.is_transition_marker:
# Transition markers are always kept (phase boundaries)
kept_structural.append(msg)
elif msg.role == "tool":
kept_structural.append(msg)
elif msg.role == "assistant" and msg.tool_calls:
compact_tcs = _compact_tool_calls(msg.tool_calls)
@@ -869,6 +1222,7 @@ class NodeConversation:
is_error=msg.is_error,
phase_id=msg.phase_id,
is_transition_marker=msg.is_transition_marker,
run_id=msg.run_id,
)
)
else:
@@ -895,15 +1249,18 @@ class NodeConversation:
# Nothing to save — skip file creation
conv_filename = ""
# Build reference message
# Build reference message. Prose format (no brackets) — see the
# poison-pattern note on truncate_tool_result. Frontier models
# autocomplete `[...']` trailers into their own text turns.
ref_parts: list[str] = []
if conv_filename:
full_path = str((spill_path / conv_filename).resolve())
ref_parts.append(
f"[Previous conversation saved to '{conv_filename}'. "
f"Use load_data('{conv_filename}') to review if needed.]"
f"Previous conversation saved at: {full_path}\n"
f"Read the full transcript with read_file('{conv_filename}')."
)
elif not collapsed_msgs:
ref_parts.append("[Previous freeform messages compacted.]")
ref_parts.append("(Previous freeform messages compacted.)")
# Aggressive: add collapsed tool-call history to the reference
if collapsed_msgs:
@@ -925,7 +1282,7 @@ class NodeConversation:
ref_seq = self._next_seq
self._next_seq += 1
ref_msg = Message(seq=ref_seq, role="user", content=ref_content)
ref_msg = Message(seq=ref_seq, role="user", content=ref_content, run_id=self._run_id)
# Persist: delete old messages from store, write reference + kept structural.
# In aggressive mode, collapsed messages may be interspersed with kept
@@ -939,7 +1296,7 @@ class NodeConversation:
# Write kept structural messages (they may have been modified)
for msg in kept_structural:
await self._store.write_part(msg.seq, msg.to_storage_dict())
await self._store.write_cursor({"next_seq": self._next_seq})
await self._write_next_seq()
# Reassemble: reference + kept structural (in original order) + recent
self._messages = [ref_msg] + kept_structural + recent_messages
@@ -976,7 +1333,7 @@ class NodeConversation:
"""Remove all messages, keep system prompt, preserve ``_next_seq``."""
if self._store:
await self._store.delete_parts_before(self._next_seq)
await self._store.write_cursor({"next_seq": self._next_seq})
await self._write_next_seq()
self._messages.clear()
self._last_api_input_tokens = None
@@ -1018,22 +1375,36 @@ class NodeConversation:
if not self._meta_persisted:
await self._persist_meta()
await self._store.write_part(message.seq, message.to_storage_dict())
await self._store.write_cursor({"next_seq": self._next_seq})
await self._write_next_seq()
async def _persist_meta(self) -> None:
"""Lazily write conversation metadata to the store (called once)."""
"""Lazily write conversation metadata to the store (called once).
When ``self._run_id`` is set, metadata is written flat for backward
compatibility (run-scoped isolation has been reverted).
"""
if self._store is None:
return
await self._store.write_meta(
{
"system_prompt": self._system_prompt,
"max_history_tokens": self._max_history_tokens,
"compaction_threshold": self._compaction_threshold,
"output_keys": self._output_keys,
}
)
run_meta = {
"system_prompt": self._system_prompt,
"max_context_tokens": self._max_context_tokens,
"compaction_threshold": self._compaction_threshold,
"compaction_buffer_tokens": self._compaction_buffer_tokens,
"compaction_warning_buffer_tokens": (
self._compaction_warning_buffer_tokens
),
"output_keys": self._output_keys,
}
await self._store.write_meta(run_meta)
self._meta_persisted = True
async def _write_next_seq(self) -> None:
if self._store is None:
return
cursor = await self._store.read_cursor() or {}
cursor["next_seq"] = self._next_seq
await self._store.write_cursor(cursor)
# --- Restore -----------------------------------------------------------
@classmethod
@@ -1041,6 +1412,7 @@ class NodeConversation:
cls,
store: ConversationStore,
phase_id: str | None = None,
run_id: str | None = None,
) -> NodeConversation | None:
"""Reconstruct a NodeConversation from a store.
@@ -1050,6 +1422,9 @@ class NodeConversation:
Used in isolated mode so a node only sees its own
messages in the shared flat store. In continuous mode
pass ``None`` to load all parts.
run_id: If set, only load parts matching this run_id.
Ensures intentional restarts (new run_id) start fresh
while crash recovery (same run_id) resumes correctly.
Returns ``None`` if the store contains no metadata (i.e. the
conversation was never persisted).
@@ -1060,21 +1435,45 @@ class NodeConversation:
conv = cls(
system_prompt=meta.get("system_prompt", ""),
max_history_tokens=meta.get("max_history_tokens", 32000),
max_context_tokens=meta.get("max_context_tokens", 32000),
compaction_threshold=meta.get("compaction_threshold", 0.8),
output_keys=meta.get("output_keys"),
store=store,
run_id=run_id,
compaction_buffer_tokens=meta.get("compaction_buffer_tokens"),
compaction_warning_buffer_tokens=meta.get(
"compaction_warning_buffer_tokens"
),
)
conv._meta_persisted = True
parts = await store.read_parts()
if phase_id:
parts = [p for p in parts if p.get("phase_id") == phase_id]
filtered_parts = [p for p in parts if p.get("phase_id") == phase_id]
if filtered_parts:
parts = filtered_parts
elif parts and all(p.get("phase_id") is None for p in parts):
# Backward compatibility: older isolated stores (including queen
# sessions) persisted parts without phase_id. In that case, the
# phase filter would incorrectly hide the entire conversation.
logger.info(
"Restoring legacy unphased conversation without applying "
"phase filter (phase_id=%s, parts=%d)",
phase_id,
len(parts),
)
else:
parts = filtered_parts
# Filter by run_id so intentional restarts (new run_id) start fresh
# while crash recovery (same run_id) loads prior parts.
if run_id and not is_legacy_run_id(run_id):
parts = [p for p in parts if p.get("run_id") == run_id]
conv._messages = [Message.from_storage_dict(p) for p in parts]
cursor = await store.read_cursor()
if cursor:
conv._next_seq = cursor["next_seq"]
next_seq = get_cursor_next_seq(cursor)
if next_seq is not None:
conv._next_seq = next_seq
elif conv._messages:
conv._next_seq = conv._messages[-1].seq + 1

Some files were not shown because too many files have changed in this diff