Merge pull request #6624 from aden-hive/feature/agent-skills
Release / Create Release (push) Waiting to run

feat: agent skills system and observability improvements
This commit is contained in:
Timothy @aden
2026-03-18 20:28:34 -07:00
committed by GitHub
38 changed files with 2469 additions and 154 deletions
+10
View File
@@ -89,6 +89,16 @@ def main():
register_testing_commands(subparsers)
# Register skill commands (skill list, skill trust, ...)
from framework.skills.cli import register_skill_commands
register_skill_commands(subparsers)
# Register debugger commands (debugger)
from framework.debugger.cli import register_debugger_commands
register_debugger_commands(subparsers)
args = parser.parse_args()
if hasattr(args, "func"):
View File
+76
View File
@@ -0,0 +1,76 @@
"""CLI command for the LLM debug log viewer."""
import argparse
import subprocess
import sys
from pathlib import Path
_SCRIPT = Path(__file__).resolve().parents[3] / "scripts" / "llm_debug_log_visualizer.py"
def register_debugger_commands(subparsers: argparse._SubParsersAction) -> None:
"""Register the ``hive debugger`` command."""
parser = subparsers.add_parser(
"debugger",
help="Open the LLM debug log viewer",
description=(
"Start a local server that lets you browse LLM debug sessions "
"recorded in ~/.hive/llm_logs. Sessions are loaded on demand so "
"the browser stays responsive."
),
)
parser.add_argument(
"--session",
help="Execution ID to select initially.",
)
parser.add_argument(
"--port",
type=int,
default=0,
help="Port for the local server (0 = auto-pick a free port).",
)
parser.add_argument(
"--logs-dir",
help="Directory containing JSONL log files (default: ~/.hive/llm_logs).",
)
parser.add_argument(
"--limit-files",
type=int,
default=None,
help="Maximum number of newest log files to scan (default: 200).",
)
parser.add_argument(
"--output",
help="Write a static HTML file instead of starting a server.",
)
parser.add_argument(
"--no-open",
action="store_true",
help="Start the server but do not open a browser.",
)
parser.add_argument(
"--include-tests",
action="store_true",
help="Show test/mock sessions (hidden by default).",
)
parser.set_defaults(func=cmd_debugger)
def cmd_debugger(args: argparse.Namespace) -> int:
"""Launch the LLM debug log visualizer."""
cmd: list[str] = [sys.executable, str(_SCRIPT)]
if args.session:
cmd += ["--session", args.session]
if args.port:
cmd += ["--port", str(args.port)]
if args.logs_dir:
cmd += ["--logs-dir", args.logs_dir]
if args.limit_files is not None:
cmd += ["--limit-files", str(args.limit_files)]
if args.output:
cmd += ["--output", args.output]
if args.no_open:
cmd.append("--no-open")
if args.include_tests:
cmd.append("--include-tests")
return subprocess.call(cmd)
+6
View File
@@ -33,6 +33,8 @@ class Message:
is_transition_marker: bool = False
# True when this message is real human input (from /chat), not a system prompt
is_client_input: bool = False
# True when message contains an activated skill body (AS-10: never prune)
is_skill_content: bool = False
def to_llm_dict(self) -> dict[str, Any]:
"""Convert to OpenAI-format message dict."""
@@ -409,6 +411,7 @@ class NodeConversation:
tool_use_id: str,
content: str,
is_error: bool = False,
is_skill_content: bool = False,
) -> Message:
msg = Message(
seq=self._next_seq,
@@ -417,6 +420,7 @@ class NodeConversation:
tool_use_id=tool_use_id,
is_error=is_error,
phase_id=self._current_phase,
is_skill_content=is_skill_content,
)
self._messages.append(msg)
self._next_seq += 1
@@ -610,6 +614,8 @@ class NodeConversation:
continue
if msg.is_error:
continue # never prune errors
if msg.is_skill_content:
continue # never prune activated skill instructions (AS-10)
if msg.content.startswith("[Pruned tool result"):
continue # already pruned
# Tiny results (set_output acks, confirmations) — pruning
+215 -10
View File
@@ -467,6 +467,8 @@ class EventLoopNode(NodeProtocol):
stream_id = ctx.stream_id or ctx.node_id
node_id = ctx.node_id
execution_id = ctx.execution_id or ""
# Store skill dirs for AS-9 file-read interception in _execute_tool
self._skill_dirs: list[str] = ctx.skill_dirs
# Verdict counters for runtime logging
_accept_count = _retry_count = _escalate_count = _continue_count = 0
@@ -806,6 +808,13 @@ class EventLoopNode(NodeProtocol):
execution_id,
extra_data=_iter_meta,
)
# Sync max_context_tokens from live config so mid-session model
# switches are reflected in compaction decisions and the UI bar.
from framework.config import get_max_context_tokens as _live_mct
conversation._max_context_tokens = _live_mct()
await self._publish_context_usage(ctx, conversation, "iteration_start")
# 6d. Pre-turn compaction check (tiered)
_compacted_this_iter = False
@@ -2726,6 +2735,7 @@ class EventLoopNode(NodeProtocol):
tool_use_id=tc.tool_use_id,
content=result.content,
is_error=result.is_error,
is_skill_content=result.is_skill_content,
)
if (
tc.tool_name in ("ask_user", "ask_user_multiple")
@@ -2834,6 +2844,8 @@ class EventLoopNode(NodeProtocol):
conversation.usage_ratio() * 100,
)
await self._publish_context_usage(ctx, conversation, "post_tool_results")
# If the turn requested external input (ask_user or queen handoff),
# return immediately so the outer loop can block before judge eval.
if user_input_requested or queen_input_requested:
@@ -3549,6 +3561,33 @@ class EventLoopNode(NodeProtocol):
content=f"No tool executor configured for '{tc.tool_name}'",
is_error=True,
)
# AS-9: Intercept file-read tools for skill directories — bypass session sandbox
_SKILL_READ_TOOLS = {"view_file", "load_data", "read_file"}
skill_dirs = getattr(self, "_skill_dirs", [])
if tc.tool_name in _SKILL_READ_TOOLS and skill_dirs:
_path = tc.tool_input.get("path", "")
if _path:
import os
from pathlib import Path as _Path
_resolved = os.path.realpath(os.path.abspath(_path))
if any(_resolved.startswith(os.path.realpath(d)) for d in skill_dirs):
try:
_content = _Path(_resolved).read_text(encoding="utf-8")
_is_skill_md = _resolved.endswith("SKILL.md")
return ToolResult(
tool_use_id=tc.tool_use_id,
content=_content,
is_skill_content=_is_skill_md, # AS-10: protect SKILL.md reads
)
except Exception as _exc:
return ToolResult(
tool_use_id=tc.tool_use_id,
content=f"Could not read skill resource '{_path}': {_exc}",
is_error=True,
)
tool_use = ToolUse(id=tc.tool_use_id, name=tc.tool_name, input=tc.tool_input)
timeout = self._config.tool_call_timeout_seconds
@@ -3980,6 +4019,12 @@ class EventLoopNode(NodeProtocol):
ratio_before = conversation.usage_ratio()
phase_grad = getattr(ctx, "continuous_mode", False)
# Capture pre-compaction message inventory when over budget,
# since compaction mutates the conversation in place.
pre_inventory: list[dict[str, Any]] | None = None
if ratio_before >= 1.0:
pre_inventory = self._build_message_inventory(conversation)
# --- Step 1: Prune old tool results (free, no LLM) ---
protect = max(2000, self._config.max_context_tokens // 12)
pruned = await conversation.prune_old_tool_results(
@@ -3994,7 +4039,7 @@ class EventLoopNode(NodeProtocol):
conversation.usage_ratio() * 100,
)
if not conversation.needs_compaction():
await self._log_compaction(ctx, conversation, ratio_before)
await self._log_compaction(ctx, conversation, ratio_before, pre_inventory)
return
# --- Step 2: Standard structure-preserving compaction (free, no LLM) ---
@@ -4007,7 +4052,7 @@ class EventLoopNode(NodeProtocol):
phase_graduated=phase_grad,
)
if not conversation.needs_compaction():
await self._log_compaction(ctx, conversation, ratio_before)
await self._log_compaction(ctx, conversation, ratio_before, pre_inventory)
return
# --- Step 3: LLM summary compaction ---
@@ -4034,7 +4079,7 @@ class EventLoopNode(NodeProtocol):
logger.warning("LLM compaction failed: %s", e)
if not conversation.needs_compaction():
await self._log_compaction(ctx, conversation, ratio_before)
await self._log_compaction(ctx, conversation, ratio_before, pre_inventory)
return
# --- Step 4: Emergency deterministic summary (LLM failed/unavailable) ---
@@ -4048,7 +4093,7 @@ class EventLoopNode(NodeProtocol):
keep_recent=1,
phase_graduated=phase_grad,
)
await self._log_compaction(ctx, conversation, ratio_before)
await self._log_compaction(ctx, conversation, ratio_before, pre_inventory)
# --- LLM compaction with binary-search splitting ----------------------
@@ -4210,13 +4255,59 @@ class EventLoopNode(NodeProtocol):
"re-doing work.\n"
)
@staticmethod
def _build_message_inventory(
    conversation: NodeConversation,
) -> list[dict[str, Any]]:
    """Build a per-message size inventory for debug logging.

    Returns one dict per message with ``seq``, ``role``, ``content_chars``
    and, where applicable, ``tool_call_args_chars``, ``tool``, ``is_error``,
    ``phase`` and a 200-char ``preview`` for messages over 2000 chars.
    """
    # Precompute tool_use_id -> tool name once instead of rescanning the
    # whole message list for every tool-role message (was O(n^2)).
    id_to_tool: dict[str, str] = {}
    for m in conversation.messages:
        if m.tool_calls:
            for tc in m.tool_calls:
                tc_id = tc.get("id")
                if tc_id and tc_id not in id_to_tool:
                    id_to_tool[tc_id] = tc.get("function", {}).get("name", "?")

    inventory: list[dict[str, Any]] = []
    for m in conversation.messages:
        content_chars = len(m.content)
        tc_chars = 0
        tool_name = None
        if m.tool_calls:
            # Assistant message issuing tool calls: count argument sizes and
            # record the called tool names.
            for tc in m.tool_calls:
                args = tc.get("function", {}).get("arguments", "")
                tc_chars += len(args) if isinstance(args, str) else len(json.dumps(args))
            names = [tc.get("function", {}).get("name", "?") for tc in m.tool_calls]
            tool_name = ", ".join(names)
        elif m.role == "tool" and m.tool_use_id:
            # Tool result: resolve the originating tool name via the map.
            tool_name = id_to_tool.get(m.tool_use_id)
        entry: dict[str, Any] = {
            "seq": m.seq,
            "role": m.role,
            "content_chars": content_chars,
        }
        if tc_chars:
            entry["tool_call_args_chars"] = tc_chars
        if tool_name:
            entry["tool"] = tool_name
        if m.is_error:
            entry["is_error"] = True
        if m.phase_id:
            entry["phase"] = m.phase_id
        if content_chars > 2000:
            # Truncation marker: the previous `+ ""` was a no-op, leaving
            # previews indistinguishable from full content.
            entry["preview"] = m.content[:200] + "…"
        inventory.append(entry)
    return inventory
async def _log_compaction(
self,
ctx: NodeContext,
conversation: NodeConversation,
ratio_before: float,
pre_inventory: list[dict[str, Any]] | None = None,
) -> None:
"""Log compaction result to runtime logger and event bus."""
"""Log compaction result to runtime logger, event bus, and debug file."""
import os as _os
ratio_after = conversation.usage_ratio()
before_pct = round(ratio_before * 100)
after_pct = round(ratio_after * 100)
@@ -4249,19 +4340,103 @@ class EventLoopNode(NodeProtocol):
if self._event_bus:
from framework.runtime.event_bus import AgentEvent, EventType
event_data: dict[str, Any] = {
"level": level,
"usage_before": before_pct,
"usage_after": after_pct,
}
if pre_inventory is not None:
event_data["message_inventory"] = pre_inventory
await self._event_bus.publish(
AgentEvent(
type=EventType.CONTEXT_COMPACTED,
stream_id=ctx.stream_id or ctx.node_id,
node_id=ctx.node_id,
data={
"level": level,
"usage_before": before_pct,
"usage_after": after_pct,
},
data=event_data,
)
)
# Emit post-compaction usage update
await self._publish_context_usage(ctx, conversation, "post_compaction")
# Write detailed debug log to ~/.hive/compaction_log/ when enabled
if _os.environ.get("HIVE_COMPACTION_DEBUG"):
self._write_compaction_debug_log(ctx, before_pct, after_pct, level, pre_inventory)
@staticmethod
def _write_compaction_debug_log(
    ctx: NodeContext,
    before_pct: int,
    after_pct: int,
    level: str,
    inventory: list[dict[str, Any]] | None,
) -> None:
    """Write detailed compaction analysis to ~/.hive/compaction_log/.

    Best-effort debug aid (gated on HIVE_COMPACTION_DEBUG upstream): any
    filesystem failure is logged at debug level and swallowed so this
    helper can never interrupt compaction itself.
    """
    log_dir = Path.home() / ".hive" / "compaction_log"
    ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%S_%f")
    node_label = ctx.node_id.replace("/", "_")
    log_path = log_dir / f"{ts}_{node_label}.md"

    lines: list[str] = [
        f"# Compaction Debug — {ctx.node_id}",
        f"**Time:** {datetime.now(UTC).isoformat()}",
        f"**Node:** {ctx.node_spec.name} (`{ctx.node_id}`)",
    ]
    if ctx.stream_id:
        lines.append(f"**Stream:** {ctx.stream_id}")
    lines.append(f"**Level:** {level}")
    lines.append(f"**Usage:** {before_pct}% → {after_pct}%")
    lines.append("")
    if inventory:
        total_chars = sum(
            e.get("content_chars", 0) + e.get("tool_call_args_chars", 0) for e in inventory
        )
        lines.append(
            f"## Pre-Compaction Message Inventory "
            f"({len(inventory)} messages, {total_chars:,} total chars)"
        )
        lines.append("")
        # Rank largest-first so the dominant context consumers lead the table.
        ranked = sorted(
            inventory,
            key=lambda e: e.get("content_chars", 0) + e.get("tool_call_args_chars", 0),
            reverse=True,
        )
        lines.append("| # | seq | role | tool | chars | % of total | flags |")
        lines.append("|---|-----|------|------|------:|------------|-------|")
        for i, entry in enumerate(ranked, 1):
            chars = entry.get("content_chars", 0) + entry.get("tool_call_args_chars", 0)
            pct = (chars / total_chars * 100) if total_chars else 0
            tool = entry.get("tool", "")
            flags = []
            if entry.get("is_error"):
                flags.append("error")
            if entry.get("phase"):
                flags.append(f"phase={entry['phase']}")
            lines.append(
                f"| {i} | {entry['seq']} | {entry['role']} | {tool} "
                f"| {chars:,} | {pct:.1f}% | {', '.join(flags)} |"
            )
        # Entries carrying a preview were flagged as large when the
        # inventory was built.
        large = [e for e in ranked if e.get("preview")]
        if large:
            lines.append("")
            lines.append("### Large message previews")
            for entry in large:
                lines.append(
                    f"\n**seq={entry['seq']}** ({entry['role']}, {entry.get('tool', '')}):"
                )
                lines.append(f"```\n{entry['preview']}\n```")
    lines.append("")
    try:
        # mkdir moved inside the try: previously an unwritable/misconfigured
        # HOME raised OSError out of this debug-only helper.
        log_dir.mkdir(parents=True, exist_ok=True)
        log_path.write_text("\n".join(lines), encoding="utf-8")
        logger.debug("Compaction debug log written to %s", log_path)
    except OSError:
        logger.debug("Failed to write compaction debug log to %s", log_path)
def _build_emergency_summary(
self,
ctx: NodeContext,
@@ -4666,6 +4841,36 @@ class EventLoopNode(NodeProtocol):
if result.inject:
await conversation.add_user_message(result.inject)
async def _publish_context_usage(
    self,
    ctx: NodeContext,
    conversation: NodeConversation,
    trigger: str,
) -> None:
    """Emit a CONTEXT_USAGE_UPDATED event with current context window state."""
    bus = self._event_bus
    if not bus:
        # No event bus wired up — nothing to publish.
        return
    from framework.runtime.event_bus import AgentEvent, EventType

    token_estimate = conversation.estimate_tokens()
    window = conversation._max_context_tokens
    if window > 0:
        fill_ratio = token_estimate / window
    else:
        fill_ratio = 0.0
    payload = {
        "usage_ratio": round(fill_ratio, 4),
        "usage_pct": round(fill_ratio * 100),
        "message_count": conversation.message_count,
        "estimated_tokens": token_estimate,
        "max_context_tokens": window,
        "trigger": trigger,
    }
    event = AgentEvent(
        type=EventType.CONTEXT_USAGE_UPDATED,
        stream_id=ctx.stream_id or ctx.node_id,
        node_id=ctx.node_id,
        data=payload,
    )
    await bus.publish(event)
async def _publish_iteration(
self,
stream_id: str,
+7
View File
@@ -154,6 +154,7 @@ class GraphExecutor:
iteration_metadata_provider: Callable | None = None,
skills_catalog_prompt: str = "",
protocols_prompt: str = "",
skill_dirs: list[str] | None = None,
):
"""
Initialize the executor.
@@ -181,6 +182,7 @@ class GraphExecutor:
system prompt (for phase switching)
skills_catalog_prompt: Available skills catalog for system prompt
protocols_prompt: Default skill operational protocols for system prompt
skill_dirs: Skill base directories for Tier 3 resource access
"""
self.runtime = runtime
self.llm = llm
@@ -204,6 +206,7 @@ class GraphExecutor:
self.iteration_metadata_provider = iteration_metadata_provider
self.skills_catalog_prompt = skills_catalog_prompt
self.protocols_prompt = protocols_prompt
self.skill_dirs: list[str] = skill_dirs or []
if protocols_prompt:
self.logger.info(
@@ -1845,6 +1848,9 @@ class GraphExecutor:
existing_underscore = [k for k in memory._data if k.startswith("_")]
extra_keys = set(_skill_keys) | set(existing_underscore)
# Only inject into read_keys when it was already non-empty — an empty
# read_keys means "allow all reads" and injecting skill keys would
# inadvertently restrict reads to skill keys only.
for k in extra_keys:
if read_keys and k not in read_keys:
read_keys.append(k)
@@ -1899,6 +1905,7 @@ class GraphExecutor:
iteration_metadata_provider=self.iteration_metadata_provider,
skills_catalog_prompt=self.skills_catalog_prompt,
protocols_prompt=self.protocols_prompt,
skill_dirs=self.skill_dirs,
)
VALID_NODE_TYPES = {
+1
View File
@@ -568,6 +568,7 @@ class NodeContext:
# Skill system prompts — injected by the skill discovery pipeline
skills_catalog_prompt: str = "" # Available skills XML catalog
protocols_prompt: str = "" # Default skill operational protocols
skill_dirs: list[str] = field(default_factory=list) # Skill base dirs for resource access
# Per-iteration metadata provider — when set, EventLoopNode merges
# the returned dict into node_loop_iteration event data. Used by
+1
View File
@@ -45,6 +45,7 @@ class ToolResult:
tool_use_id: str
content: str
is_error: bool = False
is_skill_content: bool = False # AS-10: marks activated skill body, protected from pruning
class LLMProvider(ABC):
@@ -1,5 +1,6 @@
"""Shared MCP client connection management."""
import logging
import threading
from typing import Any
@@ -7,6 +8,8 @@ import httpx
from framework.runner.mcp_client import MCPClient, MCPServerConfig
logger = logging.getLogger(__name__)
class MCPConnectionManager:
"""Process-wide MCP client pool keyed by server name."""
@@ -46,8 +49,14 @@ class MCPConnectionManager:
with self._pool_lock:
client = self._pool.get(server_name)
if self._is_connected(client) and server_name not in self._transitions:
self._refcounts[server_name] = self._refcounts.get(server_name, 0) + 1
new_refcount = self._refcounts.get(server_name, 0) + 1
self._refcounts[server_name] = new_refcount
self._configs[server_name] = config
logger.debug(
"Reusing pooled connection for MCP server '%s' (refcount=%d)",
server_name,
new_refcount,
)
return client
transition_event = self._transitions.get(server_name)
+4 -1
View File
@@ -1,6 +1,6 @@
"""Pre-load validation for agent graphs.
Runs structural and credential checks before MCP servers are spawned.
Runs structural, credential, and skill-trust checks before MCP servers are spawned.
Fails fast with actionable error messages.
"""
@@ -169,6 +169,9 @@ def run_preload_validation(
1. Graph structure (includes GCU subagent-only checks) non-recoverable
2. Credentials potentially recoverable via interactive setup
Skill discovery and trust gating (AS-13) happen later in runner._setup()
so they have access to agent-level skill configuration.
Raises PreloadValidationError for structural issues.
Raises CredentialError for credential issues.
"""
+5 -1
View File
@@ -1343,7 +1343,7 @@ class AgentRunner:
except Exception:
pass # Best-effort — agent works without account info
# Skill configuration — the runtime handles discovery, loading, and
# Skill configuration — the runtime handles discovery, loading, trust-gating and
# prompt rasterization. The runner just builds the config.
from framework.skills.config import SkillsConfig
from framework.skills.manager import SkillsManagerConfig
@@ -1354,6 +1354,7 @@ class AgentRunner:
skills=getattr(self, "_agent_skills", None),
),
project_root=self.agent_path,
interactive=self._interactive,
)
self._setup_agent_runtime(
@@ -1465,6 +1466,9 @@ class AgentRunner:
accounts_data: list[dict] | None = None,
tool_provider_map: dict[str, str] | None = None,
event_bus=None,
skills_catalog_prompt: str = "",
protocols_prompt: str = "",
skill_dirs: list[str] | None = None,
skills_manager_config=None,
) -> None:
"""Set up multi-entry-point execution using AgentRuntime."""
+1 -1
View File
@@ -482,7 +482,7 @@ class ToolRegistry:
def register_mcp_server(
self,
server_config: dict[str, Any],
use_connection_manager: bool = False,
use_connection_manager: bool = True,
) -> int:
"""
Register an MCP server and discover its tools.
+13
View File
@@ -137,6 +137,7 @@ class AgentRuntime:
# Deprecated — pass skills_manager_config instead.
skills_catalog_prompt: str = "",
protocols_prompt: str = "",
skill_dirs: list[str] | None = None,
):
"""
Initialize agent runtime.
@@ -158,6 +159,9 @@ class AgentRuntime:
event_bus: Optional external EventBus. If provided, the runtime shares
this bus instead of creating its own. Used by SessionManager to
share a single bus between queen, worker, and judge.
skills_catalog_prompt: Available skills catalog for system prompt
protocols_prompt: Default skill operational protocols for system prompt
skill_dirs: Skill base directories for Tier 3 resource access
skills_manager_config: Skill configuration the runtime owns
discovery, loading, and prompt renderation internally.
skills_catalog_prompt: Deprecated. Pre-rendered skills catalog.
@@ -195,6 +199,8 @@ class AgentRuntime:
self._skills_manager = SkillsManager()
self._skills_manager.load()
self.skill_dirs: list[str] = self._skills_manager.allowlisted_dirs
# Primary graph identity
self._graph_id: str = graph_id or "primary"
@@ -341,6 +347,7 @@ class AgentRuntime:
tool_provider_map=self._tool_provider_map,
skills_catalog_prompt=self.skills_catalog_prompt,
protocols_prompt=self.protocols_prompt,
skill_dirs=self.skill_dirs,
)
await stream.start()
self._streams[ep_id] = stream
@@ -977,6 +984,7 @@ class AgentRuntime:
tool_provider_map=self._tool_provider_map,
skills_catalog_prompt=self.skills_catalog_prompt,
protocols_prompt=self.protocols_prompt,
skill_dirs=self.skill_dirs,
)
if self._running:
await stream.start()
@@ -1760,6 +1768,7 @@ def create_agent_runtime(
# Deprecated — pass skills_manager_config instead.
skills_catalog_prompt: str = "",
protocols_prompt: str = "",
skill_dirs: list[str] | None = None,
) -> AgentRuntime:
"""
Create and configure an AgentRuntime with entry points.
@@ -1786,6 +1795,9 @@ def create_agent_runtime(
accounts_data: Raw account data for per-node prompt generation.
tool_provider_map: Tool name to provider name mapping for account routing.
event_bus: Optional external EventBus to share with other components.
skills_catalog_prompt: Available skills catalog for system prompt.
protocols_prompt: Default skill operational protocols for system prompt.
skill_dirs: Skill base directories for Tier 3 resource access.
skills_manager_config: Skill configuration the runtime owns
discovery, loading, and prompt renderation internally.
skills_catalog_prompt: Deprecated. Pre-rendered skills catalog.
@@ -1819,6 +1831,7 @@ def create_agent_runtime(
skills_manager_config=skills_manager_config,
skills_catalog_prompt=skills_catalog_prompt,
protocols_prompt=protocols_prompt,
skill_dirs=skill_dirs,
)
for spec in entry_points:
+1
View File
@@ -117,6 +117,7 @@ class EventType(StrEnum):
# Context management
CONTEXT_COMPACTED = "context_compacted"
CONTEXT_USAGE_UPDATED = "context_usage_updated"
# External triggers
WEBHOOK_RECEIVED = "webhook_received"
@@ -188,6 +188,7 @@ class ExecutionStream:
tool_provider_map: dict[str, str] | None = None,
skills_catalog_prompt: str = "",
protocols_prompt: str = "",
skill_dirs: list[str] | None = None,
):
"""
Initialize execution stream.
@@ -213,6 +214,7 @@ class ExecutionStream:
tool_provider_map: Tool name to provider name mapping for account routing
skills_catalog_prompt: Available skills catalog for system prompt
protocols_prompt: Default skill operational protocols for system prompt
skill_dirs: Skill base directories for Tier 3 resource access
"""
self.stream_id = stream_id
self.entry_spec = entry_spec
@@ -236,6 +238,7 @@ class ExecutionStream:
self._tool_provider_map = tool_provider_map
self._skills_catalog_prompt = skills_catalog_prompt
self._protocols_prompt = protocols_prompt
self._skill_dirs: list[str] = skill_dirs or []
_es_logger = logging.getLogger(__name__)
if protocols_prompt:
@@ -696,6 +699,7 @@ class ExecutionStream:
tool_provider_map=self._tool_provider_map,
skills_catalog_prompt=self._skills_catalog_prompt,
protocols_prompt=self._protocols_prompt,
skill_dirs=self._skill_dirs,
)
# Track executor so inject_input() can reach EventLoopNode instances
self._active_executors[execution_id] = executor
@@ -8,6 +8,7 @@ write. Errors are silently swallowed — this must never break the agent.
import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import IO, Any
@@ -47,6 +48,9 @@ def log_llm_turn(
Never raises.
"""
try:
# Skip logging during test runs to avoid polluting real logs.
if os.environ.get("PYTEST_CURRENT_TEST") or os.environ.get("HIVE_DISABLE_LLM_LOGS"):
return
global _log_file, _log_ready # noqa: PLW0603
if not _log_ready:
_log_file = _open_log()
+1
View File
@@ -37,6 +37,7 @@ DEFAULT_EVENT_TYPES = [
EventType.NODE_RETRY,
EventType.NODE_TOOL_DOOM_LOOP,
EventType.CONTEXT_COMPACTED,
EventType.CONTEXT_USAGE_UPDATED,
EventType.WORKER_LOADED,
EventType.CREDENTIALS_REQUIRED,
EventType.SUBAGENT_REPORT,
+4 -3
View File
@@ -819,10 +819,11 @@ class SessionManager:
exec_id = event.execution_id
if event.type == _ET.EXECUTION_STARTED:
# New run on this execution_id — reset cooldown so the first
# iteration always produces a mid-run snapshot.
# New run on this execution_id — start the cooldown timer so
# mid-run snapshots don't fire immediately at session start.
# The first snapshot will happen after _DIGEST_COOLDOWN seconds.
if exec_id:
_last_digest.pop(exec_id, None)
_last_digest[exec_id] = _time.monotonic()
elif event.type in (
_ET.EXECUTION_COMPLETED,
+7 -2
View File
@@ -1,8 +1,8 @@
"""Hive Agent Skills — discovery, parsing, and injection of SKILL.md packages.
"""Hive Agent Skills — discovery, parsing, trust gating, and injection of SKILL.md packages.
Implements the open Agent Skills standard (agentskills.io) for portable
skill discovery and activation, plus built-in default skills for runtime
operational discipline.
operational discipline, and AS-13 trust gating for project-scope skills.
"""
from framework.skills.catalog import SkillCatalog
@@ -10,7 +10,9 @@ from framework.skills.config import DefaultSkillConfig, SkillsConfig
from framework.skills.defaults import DefaultSkillManager
from framework.skills.discovery import DiscoveryConfig, SkillDiscovery
from framework.skills.manager import SkillsManager, SkillsManagerConfig
from framework.skills.models import TrustStatus
from framework.skills.parser import ParsedSkill, parse_skill_md
from framework.skills.trust import TrustedRepoStore, TrustGate
__all__ = [
"DefaultSkillConfig",
@@ -22,5 +24,8 @@ __all__ = [
"SkillsConfig",
"SkillsManager",
"SkillsManagerConfig",
"TrustGate",
"TrustedRepoStore",
"TrustStatus",
"parse_skill_md",
]
+1
View File
@@ -76,6 +76,7 @@ class SkillCatalog:
lines.append(f" <name>{escape(skill.name)}</name>")
lines.append(f" <description>{escape(skill.description)}</description>")
lines.append(f" <location>{escape(skill.location)}</location>")
lines.append(f" <base_dir>{escape(skill.base_dir)}</base_dir>")
lines.append(" </skill>")
lines.append("</available_skills>")
+120
View File
@@ -0,0 +1,120 @@
"""CLI commands for the Hive skill system.
Phase 1 commands (AS-13):
hive skill list list discovered skills across all scopes
hive skill trust <path> permanently trust a project repo's skills
Full CLI suite (CLI-1 through CLI-13) is Phase 2.
"""
from __future__ import annotations
import subprocess
import sys
from pathlib import Path
def register_skill_commands(subparsers) -> None:
"""Register the ``hive skill`` subcommand group."""
skill_parser = subparsers.add_parser("skill", help="Manage skills")
skill_sub = skill_parser.add_subparsers(dest="skill_command", required=True)
# hive skill list
list_parser = skill_sub.add_parser("list", help="List discovered skills across all scopes")
list_parser.add_argument(
"--project-dir",
default=None,
metavar="PATH",
help="Project directory to scan (default: current directory)",
)
list_parser.set_defaults(func=cmd_skill_list)
# hive skill trust
trust_parser = skill_sub.add_parser(
"trust",
help="Permanently trust a project repository so its skills load without prompting",
)
trust_parser.add_argument(
"project_path",
help="Path to the project directory (must contain a .git with a remote origin)",
)
trust_parser.set_defaults(func=cmd_skill_trust)
def cmd_skill_list(args) -> int:
"""List all discovered skills grouped by scope."""
from framework.skills.discovery import DiscoveryConfig, SkillDiscovery
project_dir = Path(args.project_dir).resolve() if args.project_dir else Path.cwd()
skills = SkillDiscovery(DiscoveryConfig(project_root=project_dir)).discover()
if not skills:
print("No skills discovered.")
return 0
scope_headers = {
"project": "PROJECT SKILLS",
"user": "USER SKILLS",
"framework": "FRAMEWORK SKILLS",
}
for scope in ("project", "user", "framework"):
scope_skills = [s for s in skills if s.source_scope == scope]
if not scope_skills:
continue
print(f"\n{scope_headers[scope]}")
print("" * 40)
for skill in scope_skills:
print(f"{skill.name}")
print(f" {skill.description}")
print(f" {skill.location}")
return 0
def cmd_skill_trust(args) -> int:
"""Permanently trust a project repository's skills."""
from framework.skills.trust import TrustedRepoStore, _normalize_remote_url
project_path = Path(args.project_path).resolve()
if not project_path.exists():
print(f"Error: path does not exist: {project_path}", file=sys.stderr)
return 1
if not (project_path / ".git").exists():
print(
f"Error: {project_path} is not a git repository (no .git directory).",
file=sys.stderr,
)
return 1
try:
result = subprocess.run(
["git", "-C", str(project_path), "remote", "get-url", "origin"],
capture_output=True,
text=True,
timeout=3,
)
if result.returncode != 0:
print(
"Error: no remote 'origin' configured in this repository.",
file=sys.stderr,
)
return 1
remote_url = result.stdout.strip()
except subprocess.TimeoutExpired:
print("Error: git remote lookup timed out.", file=sys.stderr)
return 1
except (FileNotFoundError, OSError) as e:
print(f"Error reading git remote: {e}", file=sys.stderr)
return 1
repo_key = _normalize_remote_url(remote_url)
store = TrustedRepoStore()
store.trust(repo_key, project_path=str(project_path))
print(f"✓ Trusted: {repo_key}")
print(" Stored in ~/.hive/trusted_repos.json")
print(" Skills from this repository will load without prompting in future runs.")
return 0
+19
View File
@@ -42,11 +42,14 @@ class SkillsManagerConfig:
When ``None``, community discovery is skipped.
skip_community_discovery: Explicitly skip community scanning
even when ``project_root`` is set.
interactive: Whether trust gating can prompt the user interactively.
When ``False``, untrusted project skills are silently skipped.
"""
skills_config: SkillsConfig = field(default_factory=SkillsConfig)
project_root: Path | None = None
skip_community_discovery: bool = False
interactive: bool = True
class SkillsManager:
@@ -63,6 +66,7 @@ class SkillsManager:
self._loaded = False
self._catalog_prompt: str = ""
self._protocols_prompt: str = ""
self._allowlisted_dirs: list[str] = []
# ------------------------------------------------------------------
# Factory for backwards-compat bridge
@@ -85,6 +89,7 @@ class SkillsManager:
mgr._loaded = True # skip load()
mgr._catalog_prompt = skills_catalog_prompt
mgr._protocols_prompt = protocols_prompt
mgr._allowlisted_dirs = []
return mgr
# ------------------------------------------------------------------
@@ -113,9 +118,18 @@ class SkillsManager:
# 1. Community skill discovery (when project_root is available)
catalog_prompt = ""
if self._config.project_root is not None and not self._config.skip_community_discovery:
from framework.skills.trust import TrustGate
discovery = SkillDiscovery(DiscoveryConfig(project_root=self._config.project_root))
discovered = discovery.discover()
# Trust-gate project-scope skills (AS-13)
discovered = TrustGate(interactive=self._config.interactive).filter_and_gate(
discovered, project_dir=self._config.project_root
)
catalog = SkillCatalog(discovered)
self._allowlisted_dirs = catalog.allowlisted_dirs
catalog_prompt = catalog.to_prompt()
# Pre-activated community skills
@@ -160,6 +174,11 @@ class SkillsManager:
"""Default skill operational protocols for system prompt injection."""
return self._protocols_prompt
@property
def allowlisted_dirs(self) -> list[str]:
"""Skill base directories for Tier 3 resource access (AS-6)."""
return self._allowlisted_dirs
@property
def is_loaded(self) -> bool:
return self._loaded
+52
View File
@@ -0,0 +1,52 @@
"""Data models for the Hive skill system (Agent Skills standard)."""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import StrEnum
from pathlib import Path
class SkillScope(StrEnum):
"""Where a skill was discovered."""
PROJECT = "project"
USER = "user"
FRAMEWORK = "framework"
class TrustStatus(StrEnum):
"""Trust state of a skill entry."""
TRUSTED = "trusted"
PENDING_CONSENT = "pending_consent"
DENIED = "denied"
@dataclass
class SkillEntry:
    """In-memory record for a discovered skill (PRD §4.2).

    Built from SKILL.md frontmatter during discovery; ``trust_status`` is
    later adjusted by the trust gate for project-scope entries.
    """

    name: str
    """Skill name from SKILL.md frontmatter."""
    description: str
    """Skill description from SKILL.md frontmatter."""
    location: Path
    """Absolute path to SKILL.md."""
    base_dir: Path
    """Parent directory of SKILL.md (skill root)."""
    source_scope: SkillScope
    """Which scope this skill was found in."""
    trust_status: TrustStatus = TrustStatus.TRUSTED
    """Trust state; project-scope skills start as PENDING_CONSENT before gating."""

    # Optional frontmatter fields
    license: str | None = None
    compatibility: list[str] = field(default_factory=list)
    allowed_tools: list[str] = field(default_factory=list)
    metadata: dict = field(default_factory=dict)
+477
View File
@@ -0,0 +1,477 @@
"""Trust gating for project-level skills (PRD AS-13).
Project-level skills from untrusted repositories require explicit user consent
before their instructions are loaded into the agent's system prompt.
Framework and user-scope skills are always trusted.
Trusted repos are persisted at ~/.hive/trusted_repos.json.
"""
from __future__ import annotations
import json
import logging
import subprocess
import sys
from collections.abc import Callable
from dataclasses import dataclass
from datetime import UTC, datetime
from enum import StrEnum
from pathlib import Path
from urllib.parse import urlparse
from framework.skills.parser import ParsedSkill
logger = logging.getLogger(__name__)
# Env var to bypass trust gating in CI/headless pipelines (opt-in).
_ENV_TRUST_ALL = "HIVE_TRUST_PROJECT_SKILLS"
# Env var for comma-separated own-remote glob patterns (e.g. "github.com/myorg/*").
_ENV_OWN_REMOTES = "HIVE_OWN_REMOTES"
_TRUSTED_REPOS_PATH = Path.home() / ".hive" / "trusted_repos.json"
_NOTICE_SENTINEL_PATH = Path.home() / ".hive" / ".skill_trust_notice_shown"
# ---------------------------------------------------------------------------
# Trusted repo store
# ---------------------------------------------------------------------------
@dataclass
class TrustedRepoEntry:
    """One permanently-trusted repository record, as persisted to disk."""

    repo_key: str           # canonical "host/org/repo" key (see _normalize_remote_url)
    added_at: datetime      # when the user granted permanent trust (UTC)
    project_path: str = ""  # local checkout path recorded when trust was granted
class TrustedRepoStore:
    """Persists permanently-trusted repo keys to ~/.hive/trusted_repos.json.

    Entries are loaded lazily on first access; every mutation is written back
    atomically (tmp file + rename).
    """

    def __init__(self, path: Path | None = None) -> None:
        self._path = path or _TRUSTED_REPOS_PATH
        self._entries: dict[str, TrustedRepoEntry] = {}
        self._loaded = False

    def is_trusted(self, repo_key: str) -> bool:
        """Return True when *repo_key* has been permanently trusted."""
        self._ensure_loaded()
        return repo_key in self._entries

    def trust(self, repo_key: str, project_path: str = "") -> None:
        """Record *repo_key* as permanently trusted and persist immediately."""
        self._ensure_loaded()
        entry = TrustedRepoEntry(
            repo_key=repo_key,
            added_at=datetime.now(tz=UTC),
            project_path=project_path,
        )
        self._entries[repo_key] = entry
        self._save()
        logger.info("skill_trust_store: trusted repo_key=%s", repo_key)

    def revoke(self, repo_key: str) -> bool:
        """Remove *repo_key* from the store. Returns True if it was present."""
        self._ensure_loaded()
        if repo_key not in self._entries:
            return False
        del self._entries[repo_key]
        self._save()
        logger.info("skill_trust_store: revoked repo_key=%s", repo_key)
        return True

    def list_entries(self) -> list[TrustedRepoEntry]:
        """Return all trusted entries."""
        self._ensure_loaded()
        return list(self._entries.values())

    def _ensure_loaded(self) -> None:
        # Lazy one-shot load: constructing the store never touches disk.
        if self._loaded:
            return
        self._load()
        self._loaded = True

    def _load(self) -> None:
        try:
            document = json.loads(self._path.read_text(encoding="utf-8"))
            for raw in document.get("entries", []):
                key = raw.get("repo_key", "")
                if not key:
                    continue  # skip malformed rows rather than fail the load
                try:
                    when = datetime.fromisoformat(raw["added_at"])
                except (KeyError, ValueError):
                    # Missing/garbled timestamp: fall back to "now" rather than drop.
                    when = datetime.now(tz=UTC)
                self._entries[key] = TrustedRepoEntry(
                    repo_key=key,
                    added_at=when,
                    project_path=raw.get("project_path", ""),
                )
        except FileNotFoundError:
            pass  # first run: nothing persisted yet
        except Exception as e:
            logger.warning(
                "skill_trust_store: could not read %s (%s); treating as empty",
                self._path,
                e,
            )

    def _save(self) -> None:
        self._path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "version": 1,
            "entries": [
                {
                    "repo_key": e.repo_key,
                    "added_at": e.added_at.isoformat(),
                    "project_path": e.project_path,
                }
                for e in self._entries.values()
            ],
        }
        # Atomic write: write to .tmp then rename
        scratch = self._path.with_suffix(".tmp")
        scratch.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        scratch.replace(self._path)
# ---------------------------------------------------------------------------
# Trust classification
# ---------------------------------------------------------------------------
class ProjectTrustClassification(StrEnum):
    """Outcome of ProjectTrustDetector.classify for a project directory."""

    ALWAYS_TRUSTED = "always_trusted"    # intrinsically trusted (no remote, localhost, own remote)
    TRUSTED_BY_USER = "trusted_by_user"  # repo_key found in TrustedRepoStore
    UNTRUSTED = "untrusted"              # unknown remote; consent required
class ProjectTrustDetector:
    """Classifies a project directory as trusted or untrusted.

    Algorithm (PRD §4.1 trust note):
      1. No project_dir             -> ALWAYS_TRUSTED
      2. No .git directory          -> ALWAYS_TRUSTED (not a git repo)
      3. No remote 'origin'         -> ALWAYS_TRUSTED (local-only repo)
      4. Remote URL -> repo_key; in TrustedRepoStore -> TRUSTED_BY_USER
      5. Localhost remote           -> ALWAYS_TRUSTED
      6. ~/.hive/own_remotes match  -> ALWAYS_TRUSTED
      7. HIVE_OWN_REMOTES env match -> ALWAYS_TRUSTED
      8. None of the above          -> UNTRUSTED
    """

    def __init__(self, store: TrustedRepoStore | None = None) -> None:
        self._store = store or TrustedRepoStore()

    def classify(self, project_dir: Path | None) -> tuple[ProjectTrustClassification, str]:
        """Return (classification, repo_key).

        repo_key is empty string for ALWAYS_TRUSTED cases without a remote.
        """
        if project_dir is None or not project_dir.exists():
            return ProjectTrustClassification.ALWAYS_TRUSTED, ""
        if not (project_dir / ".git").exists():
            return ProjectTrustClassification.ALWAYS_TRUSTED, ""
        remote_url = self._get_remote_origin(project_dir)
        if not remote_url:
            # Covers both "no origin configured" and git-lookup failures,
            # which _get_remote_origin reports as an empty string.
            return ProjectTrustClassification.ALWAYS_TRUSTED, ""
        repo_key = _normalize_remote_url(remote_url)
        # Explicitly trusted by user
        if self._store.is_trusted(repo_key):
            return ProjectTrustClassification.TRUSTED_BY_USER, repo_key
        # Localhost remotes are always trusted
        if _is_localhost_remote(remote_url):
            return ProjectTrustClassification.ALWAYS_TRUSTED, repo_key
        # User-configured own-remote patterns
        if self._matches_own_remotes(repo_key):
            return ProjectTrustClassification.ALWAYS_TRUSTED, repo_key
        return ProjectTrustClassification.UNTRUSTED, repo_key

    def _get_remote_origin(self, project_dir: Path) -> str:
        """Run ``git remote get-url origin``. Returns empty string on any failure."""
        try:
            result = subprocess.run(
                ["git", "-C", str(project_dir), "remote", "get-url", "origin"],
                capture_output=True,
                text=True,
                timeout=3,  # bounded so a hung git cannot block skill loading
            )
            if result.returncode == 0:
                return result.stdout.strip()
        except subprocess.TimeoutExpired:
            logger.warning(
                "skill_trust: git remote lookup timed out for %s; treating as trusted",
                project_dir,
            )
        except (FileNotFoundError, OSError):
            pass  # git not found or other OS error
        return ""

    def _matches_own_remotes(self, repo_key: str) -> bool:
        """Check repo_key against user-configured own-remote glob patterns."""
        import fnmatch

        patterns: list[str] = []
        # From env var
        env_patterns = _ENV_OWN_REMOTES
        import os

        raw = os.environ.get(env_patterns, "")
        if raw:
            patterns.extend(p.strip() for p in raw.split(",") if p.strip())
        # From ~/.hive/own_remotes file (one glob per line, '#' lines are comments)
        own_remotes_file = Path.home() / ".hive" / "own_remotes"
        if own_remotes_file.is_file():
            try:
                for line in own_remotes_file.read_text(encoding="utf-8").splitlines():
                    line = line.strip()
                    if line and not line.startswith("#"):
                        patterns.append(line)
            except OSError:
                pass
        return any(fnmatch.fnmatch(repo_key, p) for p in patterns)
# ---------------------------------------------------------------------------
# URL helpers (public so CLI can reuse)
# ---------------------------------------------------------------------------
def _normalize_remote_url(url: str) -> str:
"""Normalize a git remote URL to a canonical ``host/org/repo`` key.
Examples:
git@github.com:org/repo.git github.com/org/repo
https://github.com/org/repo github.com/org/repo
ssh://git@github.com/org/repo.git github.com/org/repo
"""
url = url.strip()
# SCP-style SSH: git@github.com:org/repo.git
if url.startswith("git@") and ":" in url and "://" not in url:
url = url[4:] # strip git@
url = url.replace(":", "/", 1)
elif "://" in url:
parsed = urlparse(url)
host = parsed.hostname or ""
path = parsed.path.lstrip("/")
url = f"{host}/{path}"
# Strip .git suffix
if url.endswith(".git"):
url = url[:-4]
return url.lower().strip("/")
def _is_localhost_remote(remote_url: str) -> bool:
"""Return True if the remote points to a local host."""
local_hosts = {"localhost", "127.0.0.1", "::1"}
try:
if "://" in remote_url:
parsed = urlparse(remote_url)
return (parsed.hostname or "").lower() in local_hosts
# SCP-style: git@localhost:org/repo
if "@" in remote_url:
host_part = remote_url.split("@", 1)[1].split(":")[0]
return host_part.lower() in local_hosts
except Exception:
pass
return False
# ---------------------------------------------------------------------------
# Trust gate
# ---------------------------------------------------------------------------
class TrustGate:
    """Filters skill list, running consent flow for untrusted project-scope skills.

    Framework and user-scope skills are always allowed through.
    Project-scope skills from untrusted repos require consent.

    All terminal I/O is injectable (``print_fn``/``input_fn``) so the consent
    flow can be exercised in tests without a TTY.
    """

    def __init__(
        self,
        store: TrustedRepoStore | None = None,
        detector: ProjectTrustDetector | None = None,
        interactive: bool = True,
        print_fn: Callable[[str], None] | None = None,
        input_fn: Callable[[str], str] | None = None,
    ) -> None:
        self._store = store or TrustedRepoStore()
        self._detector = detector or ProjectTrustDetector(self._store)
        self._interactive = interactive
        self._print = print_fn or print
        self._input = input_fn or input

    def filter_and_gate(
        self,
        skills: list[ParsedSkill],
        project_dir: Path | None,
    ) -> list[ParsedSkill]:
        """Return the subset of skills that are trusted for loading.

        - Framework and user-scope skills: always included.
        - Project-scope skills: classified; consent prompt shown if untrusted.
        """
        import os

        # Separate project skills from always-trusted scopes
        always_trusted = [s for s in skills if s.source_scope != "project"]
        project_skills = [s for s in skills if s.source_scope == "project"]
        if not project_skills:
            return always_trusted

        # Env-var CI override: trust all project skills for this invocation
        if os.environ.get(_ENV_TRUST_ALL, "").strip() == "1":
            logger.info(
                "skill_trust: %s=1 set; trusting %d project skill(s) without consent",
                _ENV_TRUST_ALL,
                len(project_skills),
            )
            return always_trusted + project_skills

        classification, repo_key = self._detector.classify(project_dir)
        if classification in (
            ProjectTrustClassification.ALWAYS_TRUSTED,
            ProjectTrustClassification.TRUSTED_BY_USER,
        ):
            logger.info(
                "skill_trust: project skills trusted classification=%s repo=%s count=%d",
                classification,
                repo_key or "(no remote)",
                len(project_skills),
            )
            return always_trusted + project_skills

        # UNTRUSTED — need consent. In headless mode we deny by default and
        # tell the user how to trust the repo permanently.
        if not self._interactive or not sys.stdin.isatty():
            logger.warning(
                "skill_trust: skipping %d project-scope skill(s) from untrusted repo "
                "'%s' (non-interactive mode). "
                "To trust permanently run: hive skill trust %s",
                len(project_skills),
                repo_key,
                project_dir or ".",
            )
            logger.info(
                "skill_trust_decision repo=%s skills=%d decision=denied mode=headless",
                repo_key,
                len(project_skills),
            )
            return always_trusted

        # Interactive consent flow
        decision = self._run_consent_flow(project_skills, project_dir, repo_key)
        logger.info(
            "skill_trust_decision repo=%s skills=%d decision=%s mode=interactive",
            repo_key,
            len(project_skills),
            decision,
        )
        if decision == "session":
            return always_trusted + project_skills
        if decision == "permanent":
            # Persist so future runs skip the prompt for this repo.
            self._store.trust(repo_key, project_path=str(project_dir or ""))
            return always_trusted + project_skills
        # denied
        return always_trusted

    def _run_consent_flow(
        self,
        project_skills: list[ParsedSkill],
        project_dir: Path | None,
        repo_key: str,
    ) -> str:
        """Show the security notice (once) and consent prompt.

        Return 'session' | 'permanent' | 'denied'.
        """
        from framework.credentials.setup import Colors

        if not sys.stdout.isatty():
            Colors.disable()
        self._maybe_show_security_notice(Colors)
        self._print_consent_prompt(project_skills, project_dir, repo_key, Colors)
        return self._prompt_consent(Colors)

    def _maybe_show_security_notice(self, Colors) -> None:  # noqa: N803
        """Show the one-time security notice if not already shown (NFR-5)."""
        if _NOTICE_SENTINEL_PATH.exists():
            return
        self._print("")
        self._print(
            f"{Colors.YELLOW}Security notice:{Colors.NC} Skills inject instructions "
            "into the agent's system prompt."
        )
        self._print(
            "  Only load skills from sources you trust. "
            "Registry skills at tier 'verified' or 'official' have been audited."
        )
        self._print("")
        try:
            # Sentinel file marks the notice as shown; failure to write it is
            # non-fatal (the notice simply repeats next run).
            _NOTICE_SENTINEL_PATH.parent.mkdir(parents=True, exist_ok=True)
            _NOTICE_SENTINEL_PATH.touch()
        except OSError:
            pass

    def _print_consent_prompt(
        self,
        project_skills: list[ParsedSkill],
        project_dir: Path | None,
        repo_key: str,
        Colors,  # noqa: N803
    ) -> None:
        """Render the consent banner listing the skills and the three options."""
        p = self._print
        p("")
        p(f"{Colors.YELLOW}{'=' * 60}{Colors.NC}")
        p(f"{Colors.BOLD}  SKILL TRUST REQUIRED{Colors.NC}")
        p(f"{Colors.YELLOW}{'=' * 60}{Colors.NC}")
        p("")
        proj_label = str(project_dir) if project_dir else "this project"
        p(
            f"  The project at {Colors.CYAN}{proj_label}{Colors.NC} wants to load "
            f"{len(project_skills)} skill(s)"
        )
        p("  that will inject instructions into the agent's system prompt.")
        if repo_key:
            p(f"  Source: {Colors.BOLD}{repo_key}{Colors.NC}")
        p("")
        p("  Skills requesting access:")
        for skill in project_skills:
            # '•' bullet restored — the glyph was lost to encoding damage,
            # leaving an empty colored span before each skill name.
            p(f"  {Colors.CYAN}•{Colors.NC} {Colors.BOLD}{skill.name}{Colors.NC}")
            p(f'      "{skill.description}"')
            p(f"      {Colors.DIM}{skill.location}{Colors.NC}")
        p("")
        p("  Options:")
        p(f"  {Colors.CYAN}1){Colors.NC} Trust this session only")
        p(f"  {Colors.CYAN}2){Colors.NC} Trust permanently — remember for future runs")
        p(
            f"  {Colors.DIM}3) Deny"
            f" — skip all project-scope skills from this repo{Colors.NC}"
        )
        # Closing rule: '=' restored (was an empty string repeated 60 times,
        # printing nothing), matching the opening banner above.
        p(f"{Colors.YELLOW}{'=' * 60}{Colors.NC}")

    def _prompt_consent(self, Colors) -> str:  # noqa: N803
        """Prompt until a valid choice is entered. Returns 'session'|'permanent'|'denied'."""
        mapping = {"1": "session", "2": "permanent", "3": "denied"}
        while True:
            try:
                choice = self._input("Select option (1-3): ").strip()
                if choice in mapping:
                    return mapping[choice]
            except (KeyboardInterrupt, EOFError):
                # Ctrl-C / EOF is treated as an explicit denial.
                return "denied"
            self._print(f"{Colors.RED}Invalid choice. Enter 1, 2, or 3.{Colors.NC}")
+1
View File
@@ -324,6 +324,7 @@ export type EventTypeName =
| "node_retry"
| "edge_traversed"
| "context_compacted"
| "context_usage_updated"
| "webhook_received"
| "custom"
| "escalation_requested"
+61 -1
View File
@@ -1,5 +1,12 @@
import { memo, useState, useRef, useEffect } from "react";
import { Send, Square, Crown, Cpu, Check, Loader2 } from "lucide-react";
/** Context-window usage snapshot for one agent (queen or a worker). */
export interface ContextUsageEntry {
  /** Percent of the context window in use (clamped to 100 when rendered). */
  usagePct: number;
  /** Number of messages currently in the conversation. */
  messageCount: number;
  /** Estimated tokens currently consumed. */
  estimatedTokens: number;
  /** Maximum context tokens available. */
  maxTokens: number;
}
import MarkdownContent from "@/components/MarkdownContent";
import QuestionWidget from "@/components/QuestionWidget";
import MultiQuestionWidget from "@/components/MultiQuestionWidget";
@@ -47,6 +54,8 @@ interface ChatPanelProps {
onQuestionDismiss?: () => void;
/** Queen operating phase — shown as a tag on queen messages */
queenPhase?: "planning" | "building" | "staging" | "running";
/** Context window usage for queen and workers */
contextUsage?: Record<string, ContextUsageEntry>;
}
const queenColor = "hsl(45,95%,58%)";
@@ -241,7 +250,7 @@ const MessageBubble = memo(function MessageBubble({ msg, queenPhase }: { msg: Ch
);
}, (prev, next) => prev.msg.id === next.msg.id && prev.msg.content === next.msg.content && prev.msg.phase === next.msg.phase && prev.queenPhase === next.queenPhase);
export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting, isBusy, activeThread, disabled, onCancel, pendingQuestion, pendingOptions, pendingQuestions, onQuestionSubmit, onMultiQuestionSubmit, onQuestionDismiss, queenPhase }: ChatPanelProps) {
export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting, isBusy, activeThread, disabled, onCancel, pendingQuestion, pendingOptions, pendingQuestions, onQuestionSubmit, onMultiQuestionSubmit, onQuestionDismiss, queenPhase, contextUsage }: ChatPanelProps) {
const [input, setInput] = useState("");
const [readMap, setReadMap] = useState<Record<string, number>>({});
const bottomRef = useRef<HTMLDivElement>(null);
@@ -356,6 +365,57 @@ export default function ChatPanel({ messages, onSend, isWaiting, isWorkerWaiting
<div ref={bottomRef} />
</div>
{/* Context window usage bar — sits between messages and input */}
{(() => {
if (!contextUsage) return null;
const queenUsage = contextUsage["__queen__"];
const workerEntries = Object.entries(contextUsage).filter(([k]) => k !== "__queen__");
const workerUsage = workerEntries.length > 0
? workerEntries.reduce((best, [, v]) => (v.usagePct > best.usagePct ? v : best), workerEntries[0][1])
: undefined;
if (!queenUsage && !workerUsage) return null;
return (
<div className="flex items-center gap-3 mx-4 px-3 py-1 rounded-lg bg-muted/30 border border-border/20 group/ctx flex-shrink-0">
{queenUsage && (
<div className="flex items-center gap-2 flex-1 min-w-0" title={`Queen: ${(queenUsage.estimatedTokens / 1000).toFixed(1)}k / ${(queenUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${queenUsage.messageCount} messages`}>
<Crown className="w-3 h-3 flex-shrink-0" style={{ color: "hsl(45,95%,58%)" }} />
<div className="flex-1 h-1.5 rounded-full bg-muted/50 overflow-hidden min-w-[60px]">
<div
className="h-full rounded-full transition-all duration-500 ease-out"
style={{
width: `${Math.min(queenUsage.usagePct, 100)}%`,
backgroundColor: queenUsage.usagePct >= 90 ? "hsl(0,65%,55%)" : queenUsage.usagePct >= 70 ? "hsl(35,90%,55%)" : "hsl(45,95%,58%)",
}}
/>
</div>
<span className="text-[10px] text-muted-foreground/70 flex-shrink-0 tabular-nums">
<span className="group-hover/ctx:hidden">{queenUsage.usagePct}%</span>
<span className="hidden group-hover/ctx:inline">{(queenUsage.estimatedTokens / 1000).toFixed(1)}k / {(queenUsage.maxTokens / 1000).toFixed(0)}k</span>
</span>
</div>
)}
{workerUsage && (
<div className="flex items-center gap-2 flex-1 min-w-0" title={`Worker: ${(workerUsage.estimatedTokens / 1000).toFixed(1)}k / ${(workerUsage.maxTokens / 1000).toFixed(0)}k tokens \u00b7 ${workerUsage.messageCount} messages`}>
<Cpu className="w-3 h-3 flex-shrink-0" style={{ color: "hsl(220,60%,55%)" }} />
<div className="flex-1 h-1.5 rounded-full bg-muted/50 overflow-hidden min-w-[60px]">
<div
className="h-full rounded-full transition-all duration-500 ease-out"
style={{
width: `${Math.min(workerUsage.usagePct, 100)}%`,
backgroundColor: workerUsage.usagePct >= 90 ? "hsl(0,65%,55%)" : workerUsage.usagePct >= 70 ? "hsl(35,90%,55%)" : "hsl(220,60%,55%)",
}}
/>
</div>
<span className="text-[10px] text-muted-foreground/70 flex-shrink-0 tabular-nums">
<span className="group-hover/ctx:hidden">{workerUsage.usagePct}%</span>
<span className="hidden group-hover/ctx:inline">{(workerUsage.estimatedTokens / 1000).toFixed(1)}k / {(workerUsage.maxTokens / 1000).toFixed(0)}k</span>
</span>
</div>
)}
</div>
);
})()}
{/* Input area — question widget replaces textarea when a question is pending */}
{pendingQuestions && pendingQuestions.length >= 2 && onMultiQuestionSubmit ? (
<MultiQuestionWidget
@@ -28,6 +28,13 @@ export interface SubagentReport {
status?: "running" | "complete" | "error";
}
/** Context-window usage for the node shown in this panel. */
interface ContextUsage {
  /** Percent of the context window in use (clamped to 100 when rendered). */
  usagePct: number;
  /** Number of messages currently in the conversation. */
  messageCount: number;
  /** Estimated tokens currently consumed. */
  estimatedTokens: number;
  /** Maximum context tokens available. */
  maxTokens: number;
}
interface NodeDetailPanelProps {
node: GraphNode | null;
nodeSpec?: NodeSpec | null;
@@ -38,6 +45,7 @@ interface NodeDetailPanelProps {
workerSessionId?: string | null;
nodeLogs?: string[];
actionPlan?: string;
contextUsage?: ContextUsage;
onClose: () => void;
}
@@ -309,7 +317,7 @@ const tabs: { id: Tab; label: string; Icon: React.FC<{ className?: string }> }[]
{ id: "subagents", label: "Subagents", Icon: ({ className }) => <Bot className={className} /> },
];
export default function NodeDetailPanel({ node, nodeSpec, allNodeSpecs, subagentReports, sessionId, graphId, workerSessionId, nodeLogs, actionPlan, onClose }: NodeDetailPanelProps) {
export default function NodeDetailPanel({ node, nodeSpec, allNodeSpecs, subagentReports, sessionId, graphId, workerSessionId, nodeLogs, actionPlan, contextUsage, onClose }: NodeDetailPanelProps) {
const [activeTab, setActiveTab] = useState<Tab>("overview");
const [realTools, setRealTools] = useState<ToolInfo[] | null>(null);
const [realCriteria, setRealCriteria] = useState<NodeCriteria | null>(null);
@@ -389,6 +397,43 @@ export default function NodeDetailPanel({ node, nodeSpec, allNodeSpecs, subagent
</div>
)}
{/* Context window usage */}
{contextUsage && (
<div className="px-4 py-2 border-b border-border/20 flex-shrink-0">
<div className="flex items-center gap-2 mb-1">
<span className="text-[10px] text-muted-foreground font-medium">Context</span>
<span className="text-[10px] text-muted-foreground/70 ml-auto">
{(contextUsage.estimatedTokens / 1000).toFixed(1)}k / {(contextUsage.maxTokens / 1000).toFixed(0)}k tokens
</span>
</div>
<div className="w-full h-1.5 rounded-full bg-muted/50 overflow-hidden">
<div
className="h-full rounded-full transition-all duration-500 ease-out"
style={{
width: `${Math.min(contextUsage.usagePct, 100)}%`,
backgroundColor: contextUsage.usagePct >= 90
? "hsl(0,65%,55%)"
: contextUsage.usagePct >= 70
? "hsl(35,90%,55%)"
: "hsl(45,95%,58%)",
}}
/>
</div>
<div className="flex items-center gap-2 mt-1">
<span className="text-[10px] text-muted-foreground/60">{contextUsage.messageCount} messages</span>
<span className="text-[10px] font-medium ml-auto" style={{
color: contextUsage.usagePct >= 90
? "hsl(0,65%,55%)"
: contextUsage.usagePct >= 70
? "hsl(35,90%,55%)"
: "hsl(45,95%,58%)",
}}>
{contextUsage.usagePct}%
</span>
</div>
</div>
)}
{/* Tab bar */}
<div className="flex border-b border-border/30 flex-shrink-0 px-2 pt-1 overflow-x-auto scrollbar-hide">
{tabs.filter(t => t.id !== "subagents" || (nodeSpec?.sub_agents && nodeSpec.sub_agents.length > 0)).map(tab => (
+53 -6
View File
@@ -352,6 +352,8 @@ interface AgentBackendState {
pendingQuestions: { id: string; prompt: string; options?: string[] }[] | null;
/** Whether the pending question came from queen or worker */
pendingQuestionSource: "queen" | "worker" | null;
/** Per-node context window usage (from context_usage_updated events) */
contextUsage: Record<string, { usagePct: number; messageCount: number; estimatedTokens: number; maxTokens: number }>;
}
function defaultAgentState(): AgentBackendState {
@@ -389,6 +391,7 @@ function defaultAgentState(): AgentBackendState {
pendingOptions: null,
pendingQuestions: null,
pendingQuestionSource: null,
contextUsage: {},
};
}
@@ -630,6 +633,10 @@ export default function Workspace() {
// it was created in (avoids stale-closure when phase change and message
// events arrive in the same React batch).
const queenPhaseRef = useRef<Record<string, string>>({});
// Accumulated queen text across inner_turns within the same iteration.
// Key: `${agentType}:${execution_id}:${iteration}`, value: { [inner_turn]: snapshot }.
// This lets us merge all inner_turn text into one chat bubble per iteration.
const queenIterTextRef = useRef<Record<string, Record<number, string>>>({});
// Timestamp when designingDraft was set — used to enforce minimum spinner duration.
const designingDraftSinceRef = useRef<Record<string, number>>({});
const designingDraftTimerRef = useRef<Record<string, ReturnType<typeof setTimeout>>>({});
@@ -1707,14 +1714,29 @@ export default function Workspace() {
if (isQueen) console.log('[QUEEN] chatMsg:', chatMsg?.id, chatMsg?.content?.slice(0, 50), 'turn:', currentTurn);
if (chatMsg && !suppressQueenMessages) {
// Queen emits multiple client_output_delta / llm_text_delta snapshots
// across iterations and inner tool-loop turns. Build a stable ID that
// groups streaming deltas for the *same* output (same execution +
// iteration + inner_turn) into one bubble, while keeping distinct
// outputs as separate bubbles so earlier text isn't overwritten.
// across iterations and inner tool-loop turns. Merge all inner_turns
// within the same iteration into ONE bubble so the queen's multi-step
// tool loop (text → tool → text → tool → text) appears as one cohesive
// message rather than many small fragments.
if (isQueen && (event.type === "client_output_delta" || event.type === "llm_text_delta") && event.execution_id) {
const iter = event.data?.iteration ?? 0;
const inner = event.data?.inner_turn ?? 0;
chatMsg.id = `queen-stream-${event.execution_id}-${iter}-${inner}`;
const inner = (event.data?.inner_turn as number) ?? 0;
const iterKey = `${agentType}:${event.execution_id}:${iter}`;
// Store the latest snapshot for this inner_turn
if (!queenIterTextRef.current[iterKey]) {
queenIterTextRef.current[iterKey] = {};
}
const snapshot = (event.data?.snapshot as string) || (event.data?.content as string) || "";
queenIterTextRef.current[iterKey][inner] = snapshot;
// Concatenate all inner_turn snapshots in order
const parts = queenIterTextRef.current[iterKey];
const sortedInners = Object.keys(parts).map(Number).sort((a, b) => a - b);
chatMsg.content = sortedInners.map(k => parts[k]).join("\n");
// Single ID per iteration — no inner_turn in the ID
chatMsg.id = `queen-stream-${event.execution_id}-${iter}`;
}
if (isQueen) {
chatMsg.role = role;
@@ -2136,6 +2158,29 @@ export default function Workspace() {
}
break;
case "context_usage_updated": {
const streamKey = isQueen ? "__queen__" : (event.node_id || streamId);
const usagePct = (event.data?.usage_pct as number) ?? 0;
const messageCount = (event.data?.message_count as number) ?? 0;
const estimatedTokens = (event.data?.estimated_tokens as number) ?? 0;
const maxTokens = (event.data?.max_context_tokens as number) ?? 0;
setAgentStates(prev => {
const state = prev[agentType];
if (!state) return prev;
return {
...prev,
[agentType]: {
...state,
contextUsage: {
...state.contextUsage,
[streamKey]: { usagePct, messageCount, estimatedTokens, maxTokens },
},
},
};
});
}
break;
case "node_action_plan":
if (!isQueen && event.node_id) {
const plan = (event.data?.plan as string) || "";
@@ -3174,6 +3219,7 @@ export default function Workspace() {
}
onMultiQuestionSubmit={handleMultiQuestionAnswer}
onQuestionDismiss={handleQuestionDismiss}
contextUsage={activeAgentState?.contextUsage}
/>
)}
</div>
@@ -3377,6 +3423,7 @@ export default function Workspace() {
workerSessionId={null}
nodeLogs={activeAgentState?.nodeLogs[resolvedSelectedNode.id] || []}
actionPlan={activeAgentState?.nodeActionPlans[resolvedSelectedNode.id]}
contextUsage={activeAgentState?.contextUsage[resolvedSelectedNode.id]}
onClose={() => setSelectedNode(null)}
/>
)}
+142
View File
@@ -0,0 +1,142 @@
"""Tests for AS-9: Skill directory allowlisting in file-read tool interception."""
from unittest.mock import MagicMock
import pytest
from framework.llm.provider import ToolResult
def _make_tool_call_event(tool_name: str, path: str):
"""Build a minimal ToolCallEvent-like object."""
tc = MagicMock()
tc.tool_use_id = "tc-1"
tc.tool_name = tool_name
tc.tool_input = {"path": path}
return tc
def _make_node(skill_dirs: list[str]):
    """Build a minimal EventLoopNode with skill_dirs set."""
    from framework.graph.event_loop_node import EventLoopNode

    # The executor returns a sentinel so tests can tell when a call was NOT
    # intercepted by the skill-dir file-read shortcut.
    mock_result = ToolResult(tool_use_id="tc-1", content="from-executor")
    node = EventLoopNode(tool_executor=MagicMock(return_value=mock_result))
    node._skill_dirs = skill_dirs
    return node
class TestSkillFileReadInterception:
    # Each test builds a real file tree under pytest's tmp_path, then checks
    # whether _execute_tool serves the read directly (path under a skill dir)
    # or delegates to the injected tool executor (anything else).

    @pytest.mark.asyncio
    async def test_reads_file_in_skill_dir(self, tmp_path):
        """File under a skill dir is read directly, bypassing the executor."""
        skill_dir = tmp_path / "my-skill"
        skill_dir.mkdir()
        script = skill_dir / "scripts" / "run.py"
        script.parent.mkdir()
        script.write_text("print('hello')")
        node = _make_node([str(skill_dir)])
        tc = _make_tool_call_event("view_file", str(script))
        result = await node._execute_tool(tc)
        assert result.content == "print('hello')"
        assert not result.is_error
        node._tool_executor.assert_not_called()

    @pytest.mark.asyncio
    async def test_skill_md_read_marked_as_skill_content(self, tmp_path):
        """Reading SKILL.md sets is_skill_content=True for AS-10 protection."""
        skill_dir = tmp_path / "my-skill"
        skill_dir.mkdir()
        skill_md = skill_dir / "SKILL.md"
        skill_md.write_text("---\nname: my-skill\ndescription: Test\n---\nInstructions.")
        node = _make_node([str(skill_dir)])
        tc = _make_tool_call_event("view_file", str(skill_md))
        result = await node._execute_tool(tc)
        assert result.is_skill_content is True
        assert not result.is_error

    @pytest.mark.asyncio
    async def test_non_skill_md_resource_not_marked(self, tmp_path):
        """Bundled resource (not SKILL.md) is NOT marked as skill_content."""
        skill_dir = tmp_path / "my-skill"
        skill_dir.mkdir()
        ref = skill_dir / "references" / "api.md"
        ref.parent.mkdir()
        ref.write_text("# API Reference")
        node = _make_node([str(skill_dir)])
        tc = _make_tool_call_event("load_data", str(ref))
        result = await node._execute_tool(tc)
        assert result.is_skill_content is False
        assert not result.is_error

    @pytest.mark.asyncio
    async def test_path_outside_skill_dir_goes_to_executor(self, tmp_path):
        """Path outside skill dirs is passed through to the executor unchanged."""
        skill_dir = tmp_path / "my-skill"
        skill_dir.mkdir()
        other_file = tmp_path / "other" / "file.txt"
        other_file.parent.mkdir()
        other_file.write_text("other content")
        node = _make_node([str(skill_dir)])
        tc = _make_tool_call_event("view_file", str(other_file))
        result = await node._execute_tool(tc)
        assert result.content == "from-executor"
        node._tool_executor.assert_called_once()

    @pytest.mark.asyncio
    async def test_no_skill_dirs_goes_to_executor(self, tmp_path):
        """When skill_dirs is empty, all tool calls go to executor."""
        skill_dir = tmp_path / "my-skill"
        skill_dir.mkdir()
        script = skill_dir / "scripts" / "run.py"
        script.parent.mkdir()
        script.write_text("print('hello')")
        node = _make_node([])
        tc = _make_tool_call_event("view_file", str(script))
        result = await node._execute_tool(tc)
        assert result.content == "from-executor"
        node._tool_executor.assert_called_once()

    @pytest.mark.asyncio
    async def test_missing_file_returns_error(self, tmp_path):
        """Non-existent file under skill dir returns is_error=True."""
        skill_dir = tmp_path / "my-skill"
        skill_dir.mkdir()
        missing = skill_dir / "scripts" / "missing.py"
        node = _make_node([str(skill_dir)])
        tc = _make_tool_call_event("view_file", str(missing))
        result = await node._execute_tool(tc)
        assert result.is_error is True
        assert "Could not read skill resource" in result.content

    @pytest.mark.asyncio
    async def test_non_file_read_tool_goes_to_executor(self, tmp_path):
        """Non file-read tools (e.g. web_search) bypass the interceptor."""
        skill_dir = tmp_path / "my-skill"
        skill_dir.mkdir()
        node = _make_node([str(skill_dir)])
        tc = _make_tool_call_event("web_search", str(skill_dir / "SKILL.md"))
        result = await node._execute_tool(tc)
        assert result.content == "from-executor"
        node._tool_executor.assert_called_once()
+8 -1
View File
@@ -69,7 +69,13 @@ class TestSkillCatalog:
def test_to_prompt_xml_generation(self):
skills = [
_make_skill("alpha", "Alpha skill", "project", location="/p/alpha/SKILL.md"),
_make_skill(
"alpha",
"Alpha skill",
"project",
location="/p/alpha/SKILL.md",
base_dir="/p/alpha",
),
_make_skill("beta", "Beta skill", "user", location="/u/beta/SKILL.md"),
]
catalog = SkillCatalog(skills)
@@ -81,6 +87,7 @@ class TestSkillCatalog:
assert "<name>beta</name>" in prompt
assert "<description>Alpha skill</description>" in prompt
assert "<location>/p/alpha/SKILL.md</location>" in prompt
assert "<base_dir>/p/alpha</base_dir>" in prompt
def test_to_prompt_sorted_by_name(self):
skills = [
@@ -0,0 +1,90 @@
"""Tests for AS-10: Activated skill content protected from context pruning."""
import pytest
from framework.graph.conversation import Message, NodeConversation
def _make_conversation() -> NodeConversation:
    """Build a bare NodeConversation via __new__, bypassing __init__.

    Only the attributes the tests touch are initialized; no store/IO is wired.
    """
    conv = NodeConversation.__new__(NodeConversation)
    conv._messages = []
    conv._next_seq = 0
    conv._current_phase = None
    conv._store = None
    return conv
async def _add_tool_msg(conv: NodeConversation, content: str, **kwargs) -> Message:
    """Append a tool-result message, deriving the tool_use_id from the next seq."""
    generated_id = f"tc-{conv._next_seq}"
    return await conv.add_tool_result(tool_use_id=generated_id, content=content, **kwargs)
class TestSkillContentProtection:
    """AS-10: messages flagged ``is_skill_content`` must survive context pruning."""

    @pytest.mark.asyncio
    async def test_is_skill_content_flag_persists(self):
        """Message created with is_skill_content=True retains the flag."""
        conv = _make_conversation()
        msg = await _add_tool_msg(conv, "skill instructions", is_skill_content=True)
        assert msg.is_skill_content is True

    @pytest.mark.asyncio
    async def test_regular_message_not_marked(self):
        """Normal tool result messages are not marked as skill content."""
        conv = _make_conversation()
        msg = await _add_tool_msg(conv, "some tool output")
        assert msg.is_skill_content is False

    @pytest.mark.asyncio
    async def test_skill_content_survives_prune(self):
        """Skill content messages are skipped by prune_old_tool_results."""
        conv = _make_conversation()
        # Add many regular tool results to push over prune threshold
        for _ in range(30):
            await _add_tool_msg(conv, "x" * 500)  # ~125 tokens each
        # Add a skill content message
        skill_msg = await _add_tool_msg(
            conv,
            "## Deep Research\n" + "instructions " * 200,
            is_skill_content=True,
        )
        pruned = await conv.prune_old_tool_results(protect_tokens=500, min_prune_tokens=100)
        assert pruned > 0, "Expected some messages to be pruned"
        # Find the skill message — it must not be pruned
        matching = [m for m in conv._messages if m.seq == skill_msg.seq]
        assert matching, "Skill content message was removed"
        # Pruning rewrites a message's content to a "[Pruned tool result"
        # placeholder; the skill message must keep its original text.
        assert not matching[0].content.startswith("[Pruned tool result")

    @pytest.mark.asyncio
    async def test_regular_content_can_be_pruned(self):
        """Regular tool results are still pruned when over threshold."""
        conv = _make_conversation()
        for _ in range(20):
            await _add_tool_msg(conv, "regular tool output " * 50)
        pruned = await conv.prune_old_tool_results(protect_tokens=500, min_prune_tokens=100)
        assert pruned > 0, "Expected regular messages to be pruned"

    @pytest.mark.asyncio
    async def test_error_messages_also_protected(self):
        """Existing is_error protection still works alongside is_skill_content."""
        conv = _make_conversation()
        for _ in range(20):
            await _add_tool_msg(conv, "output " * 100)
        err_msg = await _add_tool_msg(conv, "tool failed", is_error=True)
        await conv.prune_old_tool_results(protect_tokens=200, min_prune_tokens=50)
        # The error message must survive with its original content intact.
        matching = [m for m in conv._messages if m.seq == err_msg.seq]
        assert matching
        assert not matching[0].content.startswith("[Pruned tool result")
+92
View File
@@ -0,0 +1,92 @@
"""Tests for AS-6 skill resource loading support.
Covers:
- <base_dir> element in catalog XML
- allowlisted_dirs property reflects trusted skill base directories
- skill_dirs propagation to NodeContext
"""
from framework.skills.catalog import SkillCatalog
from framework.skills.parser import ParsedSkill
def _make_skill(
    name: str,
    base_dir: str,
    source_scope: str = "project",
) -> ParsedSkill:
    """Build a minimal ParsedSkill whose SKILL.md lives directly under *base_dir*."""
    manifest_path = f"{base_dir}/SKILL.md"
    return ParsedSkill(
        name=name,
        description=f"Skill {name}",
        location=manifest_path,
        base_dir=base_dir,
        source_scope=source_scope,
        body="Instructions.",
    )
class TestSkillResourceBaseDir:
    """AS-6: catalog XML exposes base_dir; allowlisted_dirs tracks skill roots."""

    def test_base_dir_in_xml(self):
        """Each community skill entry should expose its base_dir in the catalog XML."""
        catalog = SkillCatalog([_make_skill("deploy", "/project/.hive/skills/deploy")])
        assert "<base_dir>/project/.hive/skills/deploy</base_dir>" in catalog.to_prompt()

    def test_base_dir_xml_escaped(self):
        """base_dir with XML-special chars should be escaped."""
        catalog = SkillCatalog([_make_skill("s", "/path/with <&> chars")])
        assert "<base_dir>/path/with &lt;&amp;&gt; chars</base_dir>" in catalog.to_prompt()

    def test_base_dir_absent_for_framework_skills(self):
        """Framework-scope skills are filtered from the catalog, so no base_dir either."""
        framework_skill = _make_skill(
            "fw", "/hive/_default_skills/fw", source_scope="framework"
        )
        assert SkillCatalog([framework_skill]).to_prompt() == ""

    def test_allowlisted_dirs_matches_skills(self):
        """allowlisted_dirs returns all skill base_dirs including framework ones."""
        catalog = SkillCatalog(
            [
                _make_skill("a", "/skills/a", "project"),
                _make_skill("b", "/skills/b", "user"),
                _make_skill("c", "/skills/c", "framework"),
            ]
        )
        allowlisted = catalog.allowlisted_dirs
        for expected in ("/skills/a", "/skills/b", "/skills/c"):
            assert expected in allowlisted

    def test_allowlisted_dirs_empty_catalog(self):
        """An empty catalog exposes no allowlisted directories."""
        assert SkillCatalog().allowlisted_dirs == []
class TestSkillDirsPropagation:
    """NodeContext should default skill_dirs to [] and accept an explicit list."""

    def _make_ctx(self, **kwargs):
        """Construct a NodeContext with mocked runtime/spec plus any overrides."""
        from unittest.mock import MagicMock

        from framework.graph.node import NodeContext

        defaults = dict(
            runtime=MagicMock(),
            node_id="n",
            node_spec=MagicMock(),
            memory={},
        )
        defaults.update(kwargs)
        return NodeContext(**defaults)

    def test_node_context_skill_dirs_default(self):
        """NodeContext.skill_dirs defaults to empty list."""
        assert self._make_ctx().skill_dirs == []

    def test_node_context_skill_dirs_set(self):
        """NodeContext.skill_dirs can be populated."""
        expected = ["/skills/a", "/skills/b"]
        assert self._make_ctx(skill_dirs=expected).skill_dirs == expected
+471
View File
@@ -0,0 +1,471 @@
"""Tests for skill trust gating (AS-13)."""
from __future__ import annotations
import json
from unittest.mock import MagicMock, patch
from framework.skills.parser import ParsedSkill
from framework.skills.trust import (
ProjectTrustClassification,
ProjectTrustDetector,
TrustedRepoStore,
TrustGate,
_is_localhost_remote,
_normalize_remote_url,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_skill(name: str = "test-skill", scope: str = "project") -> ParsedSkill:
    """Build a ParsedSkill rooted at /fake/<name> with the requested source scope."""
    root = f"/fake/{name}"
    return ParsedSkill(
        name=name,
        description="Test skill",
        location=f"{root}/SKILL.md",
        base_dir=root,
        source_scope=scope,
        body="Test skill instructions.",
    )
# ---------------------------------------------------------------------------
# _normalize_remote_url
# ---------------------------------------------------------------------------
class TestNormalizeRemoteUrl:
    """Every common git remote URL format normalizes to ``host/org/repo``."""

    def test_ssh_scp_format(self):
        normalized = _normalize_remote_url("git@github.com:org/repo.git")
        assert normalized == "github.com/org/repo"

    def test_https_format(self):
        normalized = _normalize_remote_url("https://github.com/org/repo.git")
        assert normalized == "github.com/org/repo"

    def test_https_no_dot_git(self):
        normalized = _normalize_remote_url("https://github.com/org/repo")
        assert normalized == "github.com/org/repo"

    def test_ssh_url_format(self):
        normalized = _normalize_remote_url("ssh://git@github.com/org/repo.git")
        assert normalized == "github.com/org/repo"

    def test_lowercased(self):
        normalized = _normalize_remote_url("git@GitHub.COM:Org/Repo.git")
        assert normalized == "github.com/org/repo"

    def test_trailing_slash_stripped(self):
        normalized = _normalize_remote_url("https://github.com/org/repo/")
        assert normalized == "github.com/org/repo"

    def test_gitlab(self):
        normalized = _normalize_remote_url("git@gitlab.com:team/project.git")
        assert normalized == "gitlab.com/team/project"
# ---------------------------------------------------------------------------
# _is_localhost_remote
# ---------------------------------------------------------------------------
class TestIsLocalhostRemote:
    """Localhost-style remotes are detected; public hosts are not."""

    def test_localhost_https(self):
        is_local = _is_localhost_remote("http://localhost/org/repo")
        assert is_local

    def test_127_0_0_1(self):
        is_local = _is_localhost_remote("https://127.0.0.1/repo")
        assert is_local

    def test_github_not_local(self):
        is_local = _is_localhost_remote("https://github.com/org/repo")
        assert not is_local

    def test_scp_localhost(self):
        is_local = _is_localhost_remote("git@localhost:org/repo")
        assert is_local
# ---------------------------------------------------------------------------
# TrustedRepoStore
# ---------------------------------------------------------------------------
class TestTrustedRepoStore:
    """Round-trip, persistence, and recovery behavior of the on-disk store."""

    def test_empty_store_is_not_trusted(self, tmp_path):
        fresh = TrustedRepoStore(tmp_path / "trusted.json")
        assert not fresh.is_trusted("github.com/org/repo")

    def test_trust_and_lookup(self, tmp_path):
        repo_store = TrustedRepoStore(tmp_path / "trusted.json")
        repo_store.trust("github.com/org/repo", project_path="/some/path")
        assert repo_store.is_trusted("github.com/org/repo")

    def test_revoke(self, tmp_path):
        repo_store = TrustedRepoStore(tmp_path / "trusted.json")
        repo_store.trust("github.com/org/repo")
        assert repo_store.revoke("github.com/org/repo")
        assert not repo_store.is_trusted("github.com/org/repo")

    def test_revoke_nonexistent_returns_false(self, tmp_path):
        repo_store = TrustedRepoStore(tmp_path / "trusted.json")
        assert not repo_store.revoke("github.com/nobody/nowhere")

    def test_persists_across_instances(self, tmp_path):
        json_path = tmp_path / "trusted.json"
        TrustedRepoStore(json_path).trust("github.com/org/repo")
        reloaded = TrustedRepoStore(json_path)
        assert reloaded.is_trusted("github.com/org/repo")

    def test_atomic_write(self, tmp_path):
        """Save must not leave a .tmp file behind."""
        json_path = tmp_path / "trusted.json"
        TrustedRepoStore(json_path).trust("github.com/org/repo")
        assert not (tmp_path / "trusted.tmp").exists()
        assert json_path.exists()

    def test_corrupted_json_recovers_gracefully(self, tmp_path):
        json_path = tmp_path / "trusted.json"
        json_path.write_text("{not valid json{{", encoding="utf-8")
        recovered = TrustedRepoStore(json_path)
        assert not recovered.is_trusted("github.com/any/repo")  # no crash

    def test_json_schema(self, tmp_path):
        json_path = tmp_path / "trusted.json"
        writer = TrustedRepoStore(json_path)
        writer.trust("github.com/org/repo", project_path="/work/repo")
        data = json.loads(json_path.read_text())
        assert data["version"] == 1
        assert data["entries"][0]["repo_key"] == "github.com/org/repo"
        assert "added_at" in data["entries"][0]

    def test_list_entries(self, tmp_path):
        repo_store = TrustedRepoStore(tmp_path / "t.json")
        repo_store.trust("github.com/a/b")
        repo_store.trust("github.com/c/d")
        assert len(repo_store.list_entries()) == 2
# ---------------------------------------------------------------------------
# ProjectTrustDetector
# ---------------------------------------------------------------------------
class TestProjectTrustDetector:
    """Trust classification of a project directory.

    ``subprocess.run`` is patched throughout, so no real ``git`` invocation
    happens; the mock's return value stands in for the remote-URL lookup.
    """

    def test_none_project_dir_always_trusted(self, tmp_path):
        # No project directory supplied at all -> nothing to gate.
        store = TrustedRepoStore(tmp_path / "t.json")
        det = ProjectTrustDetector(store)
        cls, _ = det.classify(None)
        assert cls == ProjectTrustClassification.ALWAYS_TRUSTED

    def test_nonexistent_dir_always_trusted(self, tmp_path):
        store = TrustedRepoStore(tmp_path / "t.json")
        det = ProjectTrustDetector(store)
        cls, _ = det.classify(tmp_path / "nonexistent")
        assert cls == ProjectTrustClassification.ALWAYS_TRUSTED

    def test_no_git_dir_always_trusted(self, tmp_path):
        # A plain directory without .git/ is not a repo.
        store = TrustedRepoStore(tmp_path / "t.json")
        det = ProjectTrustDetector(store)
        cls, _ = det.classify(tmp_path)
        assert cls == ProjectTrustClassification.ALWAYS_TRUSTED

    def test_no_remote_always_trusted(self, tmp_path):
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        det = ProjectTrustDetector(store)
        # git command returns non-zero (no remote)
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(returncode=1, stdout="")
            cls, _ = det.classify(tmp_path)
        assert cls == ProjectTrustClassification.ALWAYS_TRUSTED

    def test_localhost_remote_always_trusted(self, tmp_path):
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        det = ProjectTrustDetector(store)
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0, stdout="http://localhost/org/repo.git\n"
            )
            cls, _ = det.classify(tmp_path)
        assert cls == ProjectTrustClassification.ALWAYS_TRUSTED

    def test_trusted_by_store(self, tmp_path):
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        store.trust("github.com/trusted/repo")
        det = ProjectTrustDetector(store)
        with patch("subprocess.run") as mock_run:
            # SCP-style URL must be normalized before the store lookup.
            mock_run.return_value = MagicMock(
                returncode=0, stdout="git@github.com:trusted/repo.git\n"
            )
            cls, key = det.classify(tmp_path)
        assert cls == ProjectTrustClassification.TRUSTED_BY_USER
        assert key == "github.com/trusted/repo"

    def test_unknown_remote_untrusted(self, tmp_path):
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        det = ProjectTrustDetector(store)
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/repo.git\n"
            )
            cls, key = det.classify(tmp_path)
        assert cls == ProjectTrustClassification.UNTRUSTED
        assert key == "github.com/stranger/repo"

    def test_own_remotes_env_var(self, tmp_path, monkeypatch):
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        # A glob pattern in HIVE_OWN_REMOTES auto-trusts matching remotes.
        monkeypatch.setenv("HIVE_OWN_REMOTES", "github.com/myorg/*")
        det = ProjectTrustDetector(store)
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                returncode=0, stdout="git@github.com:myorg/myrepo.git\n"
            )
            cls, _ = det.classify(tmp_path)
        assert cls == ProjectTrustClassification.ALWAYS_TRUSTED

    def test_git_timeout_treated_as_trusted(self, tmp_path):
        import subprocess

        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        det = ProjectTrustDetector(store)
        # A hung git process must neither block nor distrust the project.
        with patch("subprocess.run", side_effect=subprocess.TimeoutExpired("git", 3)):
            cls, _ = det.classify(tmp_path)
        assert cls == ProjectTrustClassification.ALWAYS_TRUSTED

    def test_git_not_found_treated_as_trusted(self, tmp_path):
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        det = ProjectTrustDetector(store)
        # Machines without git installed fall back to trusting the project.
        with patch("subprocess.run", side_effect=FileNotFoundError("git not found")):
            cls, _ = det.classify(tmp_path)
        assert cls == ProjectTrustClassification.ALWAYS_TRUSTED
# ---------------------------------------------------------------------------
# TrustGate
# ---------------------------------------------------------------------------
class TestTrustGate:
    """End-to-end gating of project-scope skills behind repo trust.

    ``subprocess.run`` is patched to fake the git remote; stdin/stdout
    ``isatty`` are patched True where the interactive consent path is
    exercised, with ``input_fn``/``print_fn`` injected to script the prompt.
    """

    def test_framework_scope_always_passes(self, tmp_path):
        """Framework-scope skills bypass trust gating entirely."""
        skill = make_skill("fw-skill", "framework")
        gate = TrustGate(store=TrustedRepoStore(tmp_path / "t.json"), interactive=False)
        result = gate.filter_and_gate([skill], project_dir=None)
        assert any(s.name == "fw-skill" for s in result)

    def test_user_scope_always_passes(self, tmp_path):
        """User-scope skills bypass trust gating entirely."""
        skill = make_skill("user-skill", "user")
        gate = TrustGate(store=TrustedRepoStore(tmp_path / "t.json"), interactive=False)
        result = gate.filter_and_gate([skill], project_dir=None)
        assert any(s.name == "user-skill" for s in result)

    def test_no_project_skills_returns_early(self, tmp_path):
        """When there are no project-scope skills, trust detection is skipped."""
        fw = make_skill("fw", "framework")
        gate = TrustGate(store=TrustedRepoStore(tmp_path / "t.json"), interactive=False)
        result = gate.filter_and_gate([fw], project_dir=tmp_path)
        assert result == [fw]

    def test_trusted_project_skills_pass(self, tmp_path):
        """Project skills from a trusted repo pass through."""
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        store.trust("github.com/trusted/repo")
        skill = make_skill("proj-skill", "project")
        gate = TrustGate(store=store, interactive=False)
        with patch("subprocess.run") as m:
            m.return_value = MagicMock(returncode=0, stdout="git@github.com:trusted/repo.git\n")
            result = gate.filter_and_gate([skill], project_dir=tmp_path)
        assert any(s.name == "proj-skill" for s in result)

    def test_untrusted_headless_skips_and_logs(self, tmp_path, caplog):
        """In non-interactive mode, untrusted project skills are skipped."""
        import logging

        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        skill = make_skill("evil-skill", "project")
        gate = TrustGate(store=store, interactive=False)
        with patch("subprocess.run") as m:
            m.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/evil.git\n"
            )
            with caplog.at_level(logging.WARNING):
                result = gate.filter_and_gate([skill], project_dir=tmp_path)
        assert not any(s.name == "evil-skill" for s in result)
        # The skip must be observable in logs, not silent.
        assert "untrusted" in caplog.text.lower() or "skipping" in caplog.text.lower()

    def test_interactive_consent_session_only(self, tmp_path):
        """Option 1 (session only) includes skills without writing to store."""
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        skill = make_skill("session-skill", "project")
        outputs = []
        gate = TrustGate(
            store=store,
            interactive=True,
            print_fn=outputs.append,
            input_fn=lambda _: "1",  # trust this session
        )
        with (
            patch("sys.stdin.isatty", return_value=True),
            patch("sys.stdout.isatty", return_value=True),
            patch("subprocess.run") as m,
        ):
            m.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/repo.git\n"
            )
            result = gate.filter_and_gate([skill], project_dir=tmp_path)
        assert any(s.name == "session-skill" for s in result)
        # Must NOT persist to trusted store
        assert not store.is_trusted("github.com/stranger/repo")

    def test_interactive_consent_permanent(self, tmp_path):
        """Option 2 (permanent) includes skills and persists to trusted store."""
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        skill = make_skill("perm-skill", "project")
        gate = TrustGate(
            store=store,
            interactive=True,
            print_fn=lambda _: None,
            input_fn=lambda _: "2",  # trust permanently
        )
        with (
            patch("sys.stdin.isatty", return_value=True),
            patch("sys.stdout.isatty", return_value=True),
            patch("subprocess.run") as m,
        ):
            m.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/repo.git\n"
            )
            result = gate.filter_and_gate([skill], project_dir=tmp_path)
        assert any(s.name == "perm-skill" for s in result)
        # Unlike option 1, the decision is written through to the store.
        assert store.is_trusted("github.com/stranger/repo")

    def test_interactive_consent_deny(self, tmp_path):
        """Option 3 (deny) excludes project skills."""
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        skill = make_skill("bad-skill", "project")
        gate = TrustGate(
            store=store,
            interactive=True,
            print_fn=lambda _: None,
            input_fn=lambda _: "3",  # deny
        )
        with (
            patch("sys.stdin.isatty", return_value=True),
            patch("sys.stdout.isatty", return_value=True),
            patch("subprocess.run") as m,
        ):
            m.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/repo.git\n"
            )
            result = gate.filter_and_gate([skill], project_dir=tmp_path)
        assert not any(s.name == "bad-skill" for s in result)

    def test_env_var_override_trusts_all(self, tmp_path, monkeypatch):
        """HIVE_TRUST_PROJECT_SKILLS=1 bypasses gating entirely."""
        monkeypatch.setenv("HIVE_TRUST_PROJECT_SKILLS", "1")
        store = TrustedRepoStore(tmp_path / "t.json")
        skill = make_skill("env-skill", "project")
        gate = TrustGate(store=store, interactive=False)
        # No subprocess patch needed: the env override short-circuits detection.
        result = gate.filter_and_gate([skill], project_dir=tmp_path)
        assert any(s.name == "env-skill" for s in result)

    def test_keyboard_interrupt_treated_as_deny(self, tmp_path):
        """Ctrl-C during consent prompt should deny cleanly."""
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        skill = make_skill("interrupted-skill", "project")
        gate = TrustGate(
            store=store,
            interactive=True,
            print_fn=lambda _: None,
            # Generator trick: raises KeyboardInterrupt when input is requested.
            input_fn=lambda _: (_ for _ in ()).throw(KeyboardInterrupt()),
        )
        with (
            patch("sys.stdin.isatty", return_value=True),
            patch("sys.stdout.isatty", return_value=True),
            patch("subprocess.run") as m,
        ):
            m.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/repo.git\n"
            )
            result = gate.filter_and_gate([skill], project_dir=tmp_path)
        assert not any(s.name == "interrupted-skill" for s in result)

    def test_security_notice_shown_once(self, tmp_path, monkeypatch):
        """Security notice (NFR-5) should be shown the first time only."""
        # Use a temp sentinel path
        sentinel = tmp_path / ".skill_trust_notice_shown"
        monkeypatch.setattr("framework.skills.trust._NOTICE_SENTINEL_PATH", sentinel)
        assert not sentinel.exists()
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        skill = make_skill("notice-skill", "project")
        output_lines: list[str] = []
        gate = TrustGate(
            store=store,
            interactive=True,
            print_fn=output_lines.append,
            input_fn=lambda _: "3",
        )
        with (
            patch("sys.stdin.isatty", return_value=True),
            patch("sys.stdout.isatty", return_value=True),
            patch("subprocess.run") as m,
        ):
            m.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/repo.git\n"
            )
            gate.filter_and_gate([skill], project_dir=tmp_path)
        # First run creates the sentinel file and prints the notice.
        assert sentinel.exists()
        assert any("Security notice" in line for line in output_lines)
        # Second run should NOT show the notice again
        output_lines.clear()
        skill2 = make_skill("notice-skill-2", "project")
        with (
            patch("sys.stdin.isatty", return_value=True),
            patch("sys.stdout.isatty", return_value=True),
            patch("subprocess.run") as m,
        ):
            m.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/repo.git\n"
            )
            gate.filter_and_gate([skill2], project_dir=tmp_path)
        assert not any("Security notice" in line for line in output_lines)

    def test_mixed_scopes_only_project_gated(self, tmp_path, monkeypatch):
        """Framework and user skills should pass through even if project skills are denied."""
        (tmp_path / ".git").mkdir()
        store = TrustedRepoStore(tmp_path / "t.json")
        fw_skill = make_skill("fw", "framework")
        user_skill = make_skill("usr", "user")
        proj_skill = make_skill("proj", "project")
        gate = TrustGate(
            store=store,
            interactive=True,
            print_fn=lambda _: None,
            input_fn=lambda _: "3",  # deny project skills
        )
        with (
            patch("sys.stdin.isatty", return_value=True),
            patch("sys.stdout.isatty", return_value=True),
            patch("subprocess.run") as m,
        ):
            m.return_value = MagicMock(
                returncode=0, stdout="https://github.com/stranger/repo.git\n"
            )
            result = gate.filter_and_gate([fw_skill, user_skill, proj_skill], project_dir=tmp_path)
        names = {s.name for s in result}
        assert "fw" in names
        assert "usr" in names
        assert "proj" not in names
+30 -5
View File
@@ -152,7 +152,8 @@ def test_register_mcp_server_uses_connection_manager_when_enabled(monkeypatch):
assert client.disconnect_calls == 0
def test_register_mcp_server_defaults_to_direct_client_behavior(monkeypatch):
def test_register_mcp_server_defaults_to_connection_manager(monkeypatch):
"""Default behavior uses the connection manager (reuse enabled by default)."""
registry = ToolRegistry()
created_clients: list[_RegistryFakeClient] = []
@@ -161,13 +162,16 @@ def test_register_mcp_server_defaults_to_direct_client_behavior(monkeypatch):
created_clients.append(client)
return client
def fail_if_manager_used():
raise AssertionError("connection manager should not be used by default")
class FakeManager:
def acquire(self, config):
return fake_client_factory(config)
def release(self, server_name):
pass
monkeypatch.setattr("framework.runner.mcp_client.MCPClient", fake_client_factory)
monkeypatch.setattr(
"framework.runner.mcp_connection_manager.MCPConnectionManager.get_instance",
fail_if_manager_used,
lambda: FakeManager(),
)
count = registry.register_mcp_server(
@@ -176,6 +180,27 @@ def test_register_mcp_server_defaults_to_direct_client_behavior(monkeypatch):
assert count == 1
assert len(created_clients) == 1
def test_register_mcp_server_direct_client_when_manager_disabled(monkeypatch):
    """When use_connection_manager=False, a direct MCPClient is created."""
    registry = ToolRegistry()
    created_clients: list[_RegistryFakeClient] = []

    # Capture every client the registry constructs so it can be inspected.
    def fake_client_factory(config):
        client = _RegistryFakeClient(config)
        created_clients.append(client)
        return client

    monkeypatch.setattr("framework.runner.mcp_client.MCPClient", fake_client_factory)

    count = registry.register_mcp_server(
        {"name": "direct", "transport": "stdio", "command": "echo"},
        use_connection_manager=False,
    )

    assert count == 1
    assert len(created_clients) == 1
    # The direct client is connected exactly once during registration.
    assert created_clients[0].connect_calls == 1
    registry.cleanup()
+290
View File
@@ -0,0 +1,290 @@
# Agent Skills User Guide
This guide covers how to use, create, and manage Agent Skills in the Hive framework. Agent Skills follow the open [Agent Skills standard](https://agentskills.io) — skills written for Claude Code, Cursor, or other compatible agents work in Hive unchanged.
## What are skills?
Skills are folders containing a `SKILL.md` file that teaches an agent how to perform a specific task. They can also bundle scripts, templates, and reference materials. Skills are loaded on demand — the agent sees a lightweight catalog at startup and pulls in full instructions only when relevant.
## Quick start
### Install a skill
Drop a skill folder into one of the discovery directories:
```bash
# Project-level (shared with the repo)
mkdir -p .hive/skills/my-skill
cat > .hive/skills/my-skill/SKILL.md << 'EOF'
---
name: my-skill
description: Does X when the user asks about Y.
---
# My Skill
Step-by-step instructions for the agent...
EOF
```
The agent will discover it automatically on the next session.
### List discovered skills
```bash
hive skill list
```
Output groups skills by scope:
```
PROJECT SKILLS
────────────────────────────────────
• my-skill
Does X when the user asks about Y.
/home/user/project/.hive/skills/my-skill/SKILL.md
USER SKILLS
────────────────────────────────────
• deep-research
Multi-step web research with source verification.
/home/user/.hive/skills/deep-research/SKILL.md
```
## Where to put skills
Hive scans five directories at startup, in this precedence order:
| Scope | Path | Use case |
|-------|------|----------|
| Project (Hive) | `<project>/.hive/skills/` | Skills specific to this repo |
| Project (cross-client) | `<project>/.agents/skills/` | Skills shared across Claude Code, Cursor, etc. |
| User (Hive) | `~/.hive/skills/` | Personal skills available in all projects |
| User (cross-client) | `~/.agents/skills/` | Personal cross-client skills |
| Framework | *(built-in)* | Default operational skills shipped with Hive |
**Precedence**: If two skills share the same name, the higher-precedence location wins. A project-level `code-review` skill overrides a user-level one with the same name.
**Cross-client paths**: The `.agents/skills/` directories are a convention shared across compatible agents. A skill installed at `~/.agents/skills/pdf-processing/` is visible to Hive, Claude Code, Cursor, and other compatible tools simultaneously.
## Creating a skill
### Directory structure
```
my-skill/
├── SKILL.md # Required — metadata + instructions
├── scripts/ # Optional — executable code
│ └── run.py
├── references/ # Optional — supplementary docs
│ └── api-reference.md
└── assets/ # Optional — templates, data files
└── template.json
```
### SKILL.md format
Every skill needs a `SKILL.md` with YAML frontmatter and a markdown body:
```markdown
---
name: my-skill
description: Extract and summarize PDF documents. Use when the user mentions PDFs or document extraction.
---
# PDF Processing
## When to use
Use this skill when the user needs to extract text from PDFs or merge documents.
## Steps
1. Check if pdfplumber is available...
2. Extract text using...
## Edge cases
- Scanned PDFs need OCR first...
```
### Frontmatter fields
| Field | Required | Description |
|-------|----------|-------------|
| `name` | Yes | Lowercase letters, numbers, hyphens. Must match the parent directory name. Max 64 chars. |
| `description` | Yes | What the skill does and when to use it. Max 1024 chars. Include keywords that help the agent match tasks. |
| `license` | No | License name or reference to a bundled LICENSE file. |
| `compatibility` | No | Environment requirements (e.g., "Requires git, docker"). |
| `metadata` | No | Arbitrary key-value pairs (author, version, etc.). |
| `allowed-tools` | No | Space-delimited list of pre-approved tools. |
### Writing good descriptions
The description is critical — it's what the agent uses to decide whether to activate a skill. Be specific:
```yaml
# Good — tells the agent what and when
description: Extract text and tables from PDF files, fill PDF forms, and merge multiple PDFs. Use when working with PDF documents or when the user mentions PDFs, forms, or document extraction.
# Bad — too vague for the agent to match
description: Helps with PDFs.
```
### Writing good instructions
The markdown body is loaded into the agent's context when the skill is activated. Tips:
- **Be procedural**: Step-by-step instructions work better than abstract descriptions.
- **Keep it focused**: Stay under 500 lines / 5000 tokens. Move detailed reference material to `references/`.
- **Use relative paths**: Reference bundled files with relative paths (`scripts/run.py`, `references/guide.md`).
- **Include examples**: Show sample inputs and expected outputs.
- **Cover edge cases**: Tell the agent what to do when things go wrong.
## How skills are activated
Skills use **progressive disclosure** — three tiers that keep context usage efficient:
### Tier 1: Catalog (always loaded)
At session start, the agent sees a compact catalog of all available skills (name + description only, ~50-100 tokens each). This is how it knows what skills exist.
### Tier 2: Instructions (on demand)
When the agent determines a skill is relevant to the current task, it reads the full `SKILL.md` body into context. This happens automatically — the agent matches the task against skill descriptions and activates the best fit.
### Tier 3: Resources (on demand)
When skill instructions reference supporting files (`scripts/extract.py`, `references/api-docs.md`), the agent reads those individually as needed.
### Pre-activated skills
Some agents are configured to load specific skills at session start (skipping the catalog phase). This is set in the agent's configuration:
```python
# In agent definition
skills = ["code-review", "deep-research"]
```
Pre-activated skills have their full instructions loaded from the start, without waiting for the agent to decide they're relevant.
## Trust and security
### Why trust gating exists
Project-level skills come from the repository being worked on. If you clone an untrusted repo that contains a `.hive/skills/` directory, those skills could inject instructions into the agent's system prompt. Trust gating prevents this.
**User-level and framework skills are always trusted.** Only project-scope skills go through trust gating.
### What happens with untrusted project skills
When Hive encounters project-level skills from a repo you haven't trusted before, it shows a consent prompt:
```
============================================================
SKILL TRUST REQUIRED
============================================================
The project at /home/user/new-project wants to load 2 skill(s)
that will inject instructions into the agent's system prompt.
Source: github.com/org/new-project
Skills requesting access:
• deploy-pipeline
"Automated deployment workflow for this project."
/home/user/new-project/.hive/skills/deploy-pipeline/SKILL.md
• code-standards
"Project-specific coding standards and review checklist."
/home/user/new-project/.hive/skills/code-standards/SKILL.md
Options:
1) Trust this session only
2) Trust permanently — remember for future runs
3) Deny — skip all project-scope skills from this repo
────────────────────────────────────────────────────────────
Select option (1-3):
```
### Trust a repo via CLI
To trust a repo permanently without the interactive prompt:
```bash
hive skill trust /path/to/project
```
This stores the trust decision in `~/.hive/trusted_repos.json`, keyed by the normalized git remote URL (e.g., `github.com/org/repo`).
### Automatic trust
Some repos are trusted automatically:
- **No git repo**: Directories without `.git/` are always trusted.
- **No remote**: Local-only git repos (no `origin` remote) are always trusted.
- **Localhost remotes**: Repos with `localhost`/`127.0.0.1` remotes are always trusted.
- **Own-remote patterns**: Repos matching patterns in `~/.hive/own_remotes` or the `HIVE_OWN_REMOTES` env var are always trusted.
### Configure own-remote patterns
If you trust all repos from your organization:
```bash
# Via file (one pattern per line)
echo "github.com/my-org/*" >> ~/.hive/own_remotes
echo "gitlab.com/my-team/*" >> ~/.hive/own_remotes
# Via environment variable (comma-separated)
export HIVE_OWN_REMOTES="github.com/my-org/*,github.com/my-corp/*"
```
### CI / headless environments
In non-interactive environments, untrusted project skills are skipped and a warning is logged. To trust them explicitly:
```bash
export HIVE_TRUST_PROJECT_SKILLS=1
hive run my-agent
```
## Default skills
Hive ships with six built-in operational skills that provide runtime resilience. These are always loaded (unless disabled) and appear as "Operational Protocols" in the agent's system prompt.
| Skill | Purpose |
|-------|---------|
| `hive.note-taking` | Structured working notes in shared memory |
| `hive.batch-ledger` | Track per-item status in batch operations |
| `hive.context-preservation` | Save context before context window pruning |
| `hive.quality-monitor` | Self-assess output quality periodically |
| `hive.error-recovery` | Structured error classification and recovery |
| `hive.task-decomposition` | Break complex tasks into subtasks |
### Disable default skills
In your agent configuration:
```python
# Disable a specific default skill
default_skills = {
"hive.quality-monitor": {"enabled": False},
}
# Disable all default skills
default_skills = {
"_all": {"enabled": False},
}
```
## Environment variables
| Variable | Description |
|----------|-------------|
| `HIVE_TRUST_PROJECT_SKILLS=1` | Bypass trust gating for all project-level skills (CI override) |
| `HIVE_OWN_REMOTES` | Comma-separated glob patterns for auto-trusted remotes (e.g., `github.com/myorg/*`) |
## Compatibility with other agents
Skills written for any Agent Skills-compatible agent work in Hive:
- Place them in `.agents/skills/` (cross-client) or `.hive/skills/` (Hive-specific).
- The `SKILL.md` format is identical across Claude Code, Cursor, Gemini CLI, and others.
- Skills installed at `~/.agents/skills/` are visible to all compatible agents on your machine.
See the [Agent Skills specification](https://agentskills.io/specification) for the full format reference.
+11 -60
View File
@@ -1908,69 +1908,20 @@ if ($CodexAvailable) {
Write-Host ""
}
# Setup-only mode: show manual instructions
# Final instructions and auto-launch
Write-Host "API keys saved as User environment variables. New terminals pick them up automatically." -ForegroundColor DarkGray
Write-Host "Launch anytime with " -NoNewline -ForegroundColor DarkGray
Write-Color -Text "hive open" -Color Cyan -NoNewline
Write-Host ". Run .\quickstart.ps1 again to reconfigure." -ForegroundColor DarkGray
Write-Host ""
if ($FrontendBuilt) {
Write-Color -Text "═══════════════════════════════════════════════════════" -Color Yellow
Write-Host ""
Write-Color -Text " IMPORTANT: Restart your terminal now!" -Color Yellow
Write-Host ""
Write-Color -Text "═══════════════════════════════════════════════════════" -Color Yellow
Write-Host ""
Write-Host 'Environment variables (uv, API keys) are now configured, but you need to'
Write-Host 'restart your terminal for them to take effect in new sessions.'
Write-Host ""
Write-Color -Text "Run an Agent:" -Color White
Write-Host ""
Write-Host " Quickstart only sets things up. Launch the dashboard when you're ready:"
Write-Color -Text " hive open" -Color Cyan
Write-Host ""
if ($SelectedProviderId -or $credKey) {
Write-Color -Text "Note:" -Color White
Write-Host "- uv has been added to your User PATH"
if ($SelectedProviderId -and $SelectedEnvVar) {
Write-Host "- $SelectedEnvVar is set for LLM access"
}
if ($credKey) {
Write-Host "- HIVE_CREDENTIAL_KEY is set for credential encryption"
}
Write-Host "- All variables will persist across reboots"
Write-Host ""
}
Write-Color -Text 'Run .\quickstart.ps1 again to reconfigure.' -Color DarkGray
Write-Color -Text "Launching dashboard..." -Color White
Write-Host ""
& hive open
} else {
Write-Color -Text "═══════════════════════════════════════════════════════" -Color Yellow
Write-Host ""
Write-Color -Text " IMPORTANT: Restart your terminal now!" -Color Yellow
Write-Host ""
Write-Color -Text "═══════════════════════════════════════════════════════" -Color Yellow
Write-Host ""
Write-Host 'Environment variables (uv, API keys) are now configured, but you need to'
Write-Host 'restart your terminal for them to take effect in new sessions.'
Write-Host ""
Write-Color -Text "Run an Agent:" -Color White
Write-Host ""
Write-Host " Frontend build was skipped or failed. Once the dashboard is available, launch it with:"
Write-Color -Text "Frontend build was skipped or failed." -Color Yellow -NoNewline
Write-Host " Launch manually when ready:"
Write-Color -Text " hive open" -Color Cyan
Write-Host ""
if ($SelectedProviderId -or $credKey) {
Write-Color -Text "Note:" -Color White
Write-Host "- uv has been added to your User PATH"
if ($SelectedProviderId -and $SelectedEnvVar) {
Write-Host "- $SelectedEnvVar is set for LLM access"
}
if ($credKey) {
Write-Host "- HIVE_CREDENTIAL_KEY is set for credential encryption"
}
Write-Host "- All variables will persist across reboots"
Write-Host ""
}
Write-Color -Text 'Run .\quickstart.ps1 again to reconfigure.' -Color DarkGray
Write-Host ""
}
+8 -21
View File
@@ -1810,29 +1810,16 @@ if [ "$CODEX_AVAILABLE" = true ]; then
echo ""
fi
echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${BOLD}IMPORTANT: Load your new configuration${NC}"
echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo ""
echo -e " Your API keys have been saved to ${CYAN}$SHELL_RC_FILE${NC}"
echo -e " To use them, either:"
echo ""
echo -e " ${GREEN}Option 1:${NC} Source your shell config now:"
echo -e " ${CYAN}source $SHELL_RC_FILE${NC}"
echo ""
echo -e " ${GREEN}Option 2:${NC} Open a new terminal window"
echo ""
echo -e "${YELLOW}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
echo -e "${DIM}API keys saved to ${CYAN}$SHELL_RC_FILE${NC}${DIM}. New terminals pick them up automatically.${NC}"
echo -e "${DIM}Launch anytime with ${CYAN}hive open${NC}${DIM}. Run ./quickstart.sh again to reconfigure.${NC}"
echo ""
echo -e "${BOLD}Run an Agent:${NC}"
echo ""
if [ "$FRONTEND_BUILT" = true ]; then
echo -e " Quickstart only sets things up. Launch the dashboard when you're ready:"
echo -e "${BOLD}Launching dashboard...${NC}"
echo ""
hive open
else
echo -e " Frontend build was skipped or failed. Once the dashboard is available, launch it with:"
echo -e "${YELLOW}Frontend build was skipped or failed.${NC} Launch manually when ready:"
echo -e " ${CYAN}hive open${NC}"
echo ""
fi
echo -e " ${CYAN}hive open${NC}"
echo ""
echo -e "${DIM}Run ./quickstart.sh again to reconfigure.${NC}"
echo ""
+127 -40
View File
@@ -1,17 +1,21 @@
#!/usr/bin/env python3
"""Open a browser-based viewer for Hive LLM debug JSONL sessions.
Starts a local HTTP server and loads session data on demand (one at a time).
Usage:
uv run --no-project scripts/llm_debug_log_visualizer.py
uv run --no-project scripts/llm_debug_log_visualizer.py --no-open
uv run --no-project scripts/llm_debug_log_visualizer.py --session <execution_id>
uv run --no-project scripts/llm_debug_log_visualizer.py --port 8080
uv run --no-project scripts/llm_debug_log_visualizer.py --output debug.html
"""
from __future__ import annotations
import argparse
import http.server
import json
import tempfile
import urllib.parse
import webbrowser
from collections import defaultdict
from dataclasses import dataclass
@@ -55,10 +59,21 @@ def _parse_args() -> argparse.Namespace:
default=200,
help="Maximum number of newest log files to scan.",
)
parser.add_argument(
"--port",
type=int,
default=0,
help="Port for the local server (0 = auto-pick a free port).",
)
parser.add_argument(
"--no-open",
action="store_true",
help="Generate the HTML but do not open a browser.",
help="Start the server but do not open a browser.",
)
parser.add_argument(
"--include-tests",
action="store_true",
help="Show test/mock sessions (hidden by default).",
)
return parser.parse_args()
@@ -117,8 +132,29 @@ def _format_timestamp(raw: str) -> str:
return raw
def _is_test_session(execution_id: str, records: list[dict[str, Any]]) -> bool:
"""Return True for sessions that look like test artifacts."""
if execution_id.startswith("<MagicMock"):
return True
models = {
str(r.get("token_counts", {}).get("model", ""))
for r in records
if isinstance(r.get("token_counts"), dict)
}
models.discard("")
# Sessions that only used the mock LLM provider.
if models and models <= {"mock"}:
return True
# Sessions with no real model at all (empty string or missing).
if not models:
return True
return False
def _group_sessions(
records: list[dict[str, Any]],
*,
include_tests: bool = False,
) -> tuple[list[SessionSummary], dict[str, list[dict[str, Any]]]]:
by_session: dict[str, list[dict[str, Any]]] = defaultdict(list)
for record in records:
@@ -126,6 +162,13 @@ def _group_sessions(
if execution_id:
by_session[execution_id].append(record)
if not include_tests:
by_session = {
eid: recs
for eid, recs in by_session.items()
if not _is_test_session(eid, recs)
}
summaries: list[SessionSummary] = []
for execution_id, session_records in by_session.items():
session_records.sort(
@@ -174,7 +217,6 @@ def _group_sessions(
def _render_html(
summaries: list[SessionSummary],
sessions: dict[str, list[dict[str, Any]]],
initial_session_id: str,
) -> str:
summaries_data = [
@@ -193,16 +235,6 @@ def _render_html(
for summary in summaries
]
sessions_data = {
execution_id: sorted(
records,
key=lambda record: (
str(record.get("timestamp", "")),
record.get("iteration", 0),
),
)
for execution_id, records in sessions.items()
}
initial = initial_session_id or (summaries[0].execution_id if summaries else "")
return f"""<!DOCTYPE html>
<html lang="en">
@@ -579,10 +611,9 @@ def _render_html(
</div>
<script id="session-summaries" type="application/json">{json.dumps(summaries_data, ensure_ascii=False)}</script>
<script id="session-records" type="application/json">{json.dumps(sessions_data, ensure_ascii=False)}</script>
<script>
const summaries = JSON.parse(document.getElementById("session-summaries").textContent);
const recordsBySession = JSON.parse(document.getElementById("session-records").textContent);
const recordCache = {{}};
const initialSessionId = {json.dumps(initial, ensure_ascii=False)};
const sessionSearch = document.getElementById("sessionSearch");
@@ -746,10 +777,18 @@ def _render_html(
`;
}}
function renderSession(sessionId) {{
async function fetchSession(sessionId) {{
if (recordCache[sessionId]) return recordCache[sessionId];
const resp = await fetch(`/api/session/${{encodeURIComponent(sessionId)}}`);
if (!resp.ok) return [];
const data = await resp.json();
recordCache[sessionId] = data;
return data;
}}
async function renderSession(sessionId) {{
activeSessionId = sessionId;
const summary = summaries.find((entry) => entry.execution_id === sessionId);
const records = recordsBySession[sessionId] || [];
renderSessionChooser();
@@ -773,6 +812,9 @@ def _render_html(
renderMetaCard("Source file", summary.log_file),
].join("");
turnsEl.innerHTML = '<div class="empty">Loading session\u2026</div>';
const records = await fetchSession(sessionId);
if (activeSessionId !== sessionId) return;
turnsEl.innerHTML = records.length
? records.map((record) => renderTurn(record)).join("")
: '<div class="empty">This session has no turn records.</div>';
@@ -804,7 +846,8 @@ def _render_html(
}});
const hashSession = decodeURIComponent(window.location.hash.replace(/^#/, ""));
const bootSession = recordsBySession[hashSession] ? hashSession : activeSessionId;
const knownIds = new Set(summaries.map((s) => s.execution_id));
const bootSession = knownIds.has(hashSession) ? hashSession : activeSessionId;
renderSessionChooser();
renderSession(bootSession);
</script>
@@ -813,28 +856,70 @@ def _render_html(
"""
def _write_report(html_report: str, output: Path | None) -> Path:
if output is not None:
output.parent.mkdir(parents=True, exist_ok=True)
output.write_text(html_report, encoding="utf-8")
return output
def _sort_records(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
return sorted(
records,
key=lambda r: (str(r.get("timestamp", "")), r.get("iteration", 0)),
)
with tempfile.NamedTemporaryFile(
mode="w",
encoding="utf-8",
prefix="hive_llm_debug_",
suffix=".html",
delete=False,
dir="/tmp",
) as handle:
handle.write(html_report)
return Path(handle.name)
def _run_server(
    html: str,
    sessions: dict[str, list[dict[str, Any]]],
    port: int,
    no_open: bool,
) -> None:
    """Serve the viewer page and a per-session JSON API on localhost.

    ``port=0`` lets the OS pick a free port. Blocks until interrupted
    with Ctrl+C, then shuts the server down cleanly.
    """
    page_bytes = html.encode("utf-8")
    api_prefix = "/api/session/"

    class _ViewerHandler(http.server.BaseHTTPRequestHandler):
        def do_GET(self) -> None:
            if self.path == "/":
                self._send(200, "text/html; charset=utf-8", page_bytes)
                return
            if not self.path.startswith(api_prefix):
                self.send_error(404)
                return
            session_id = urllib.parse.unquote(self.path[len(api_prefix):])
            session_records = sessions.get(session_id)
            if session_records is None:
                # Unknown session: empty JSON array keeps the client simple.
                self._send(404, "application/json", b"[]")
            else:
                payload = json.dumps(
                    _sort_records(session_records), ensure_ascii=False
                ).encode("utf-8")
                self._send(200, "application/json", payload)

        def _send(self, code: int, content_type: str, body: bytes) -> None:
            self.send_response(code)
            self.send_header("Content-Type", content_type)
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def log_message(self, format: str, *args: object) -> None:
            # Suppress the default per-request stderr logging.
            pass

    server = http.server.HTTPServer(("127.0.0.1", port), _ViewerHandler)
    url = f"http://127.0.0.1:{server.server_address[1]}"
    print(f"Serving at {url} (Ctrl+C to stop)")
    if not no_open:
        webbrowser.open(url)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nStopped.")
    finally:
        server.server_close()
def main() -> int:
args = _parse_args()
records = _discover_records(args.logs_dir.expanduser(), args.limit_files)
summaries, sessions = _group_sessions(records)
summaries, sessions = _group_sessions(
records, include_tests=args.include_tests
)
initial_session_id = args.session or (
summaries[0].execution_id if summaries else ""
@@ -843,13 +928,15 @@ def main() -> int:
print(f"session not found: {initial_session_id}")
return 1
html_report = _render_html(summaries, sessions, initial_session_id)
output_path = _write_report(html_report, args.output)
print(output_path)
html_report = _render_html(summaries, initial_session_id)
if not args.no_open:
webbrowser.open(output_path.resolve().as_uri())
if args.output:
args.output.parent.mkdir(parents=True, exist_ok=True)
args.output.write_text(html_report, encoding="utf-8")
print(args.output)
return 0
_run_server(html_report, sessions, args.port, args.no_open)
return 0