fix: skills prompts

2026-04-14 18:51:14 -07:00
parent 50eb4b0e8f
commit 27b15789fb
4 changed files with 276 additions and 47 deletions
@@ -14,13 +14,37 @@ from framework.skills.skill_errors import SkillErrorCode, log_skill_error

 logger = logging.getLogger(__name__)

-_BEHAVIORAL_INSTRUCTION = (
-    "The following skills provide specialized instructions for specific tasks.\n"
-    "When a task matches a skill's description, read the SKILL.md at the listed\n"
-    "location to load the full instructions before proceeding.\n"
-    "When a skill references relative paths, resolve them against the skill's\n"
-    "directory (the parent of SKILL.md) and use absolute paths in tool calls."
-)
+# Upper bound on the raw `<available_skills>` XML body, in characters.
+# When the full catalog (with <description> entries) exceeds this, we fall
+# back to the compact variant that drops descriptions but keeps every skill
+# visible. Preserving awareness of every skill beats truncating entries.
+_COMPACT_THRESHOLD_CHARS = 5000
+
+_MANDATORY_HEADER_FULL = """## Skills (mandatory)
+Before replying: scan <available_skills> <description> entries.
+- If exactly one skill clearly applies: read its SKILL.md at <location> with `read_file`, then follow it.
+- If multiple could apply: choose the most specific one, then read/follow it.
+- If none clearly apply: do not read any SKILL.md.
+Constraints: never read more than one skill up front; only read after selecting.
+- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.), assume rate limits: prefer fewer larger writes, avoid tight one-item loops, serialize bursts when possible, and respect 429/Retry-After.
+
+
+The following skills provide specialized instructions for specific tasks.
+Use `read_file` to load a skill's SKILL.md when the task matches its description.
+When a skill file references a relative path, resolve it against the skill directory (parent of SKILL.md) and use that absolute path in tool commands."""
+
+_MANDATORY_HEADER_COMPACT = """## Skills (mandatory)
+Before replying: scan <available_skills> <name> entries.
+- If exactly one skill clearly applies: read its SKILL.md at <location> with `read_file`, then follow it.
+- If multiple could apply: choose the most specific one, then read/follow it.
+- If none clearly apply: do not read any SKILL.md.
+Constraints: never read more than one skill up front; only read after selecting.
+- When a skill drives external API writes (Gmail, Calendar, GitHub, etc.), assume rate limits: prefer fewer larger writes, avoid tight one-item loops, serialize bursts when possible, and respect 429/Retry-After.
+
+
+The following skills provide specialized instructions for specific tasks.
+Use `read_file` to load a skill's SKILL.md when the task matches its name.
+When a skill file references a relative path, resolve it against the skill directory (parent of SKILL.md) and use that absolute path in tool commands."""


 class SkillCatalog:
@@ -61,27 +85,42 @@ class SkillCatalog:
    def to_prompt(self) -> str:
        """Generate the catalog prompt for system prompt injection.

-        Returns empty string if no community/user skills are discovered
-        (default skills are handled separately by DefaultSkillManager).
-        """
-        # All skills go through the catalog for progressive disclosure.
-        all_skills = list(self._skills.values())
+        Returns empty string when no skills are present. Otherwise returns
+        a mandatory pre-reply checklist + decision rules + rate-limit note,
+        followed by the <available_skills> XML body.

+        When the full XML body exceeds ``_COMPACT_THRESHOLD_CHARS``, the
+        compact variant is emitted instead: <description> elements are
+        dropped so every skill stays visible before any gets truncated.
+        """
+        all_skills = sorted(self._skills.values(), key=lambda s: s.name)
        if not all_skills:
            return ""

+        full_xml = self._render_xml(all_skills, compact=False)
+        if len(full_xml) <= _COMPACT_THRESHOLD_CHARS:
+            return f"{_MANDATORY_HEADER_FULL}\n\n{full_xml}"
+
+        compact_xml = self._render_xml(all_skills, compact=True)
+        return f"{_MANDATORY_HEADER_COMPACT}\n\n{compact_xml}"
+
+    @staticmethod
+    def _render_xml(skills: list[ParsedSkill], *, compact: bool) -> str:
+        """Render the `<available_skills>` block.
+
+        ``compact=True`` drops `<description>` to preserve skill awareness
+        when the catalog would otherwise blow the char budget.
+        """
        lines = ["<available_skills>"]
-        for skill in sorted(all_skills, key=lambda s: s.name):
+        for skill in skills:
            lines.append("  <skill>")
            lines.append(f"    <name>{escape(skill.name)}</name>")
+            if not compact:
                lines.append(f"    <description>{escape(skill.description)}</description>")
            lines.append(f"    <location>{escape(skill.location)}</location>")
-            lines.append(f"    <base_dir>{escape(skill.base_dir)}</base_dir>")
            lines.append("  </skill>")
        lines.append("</available_skills>")
-
-        xml_block = "\n".join(lines)
-        return f"{_BEHAVIORAL_INSTRUCTION}\n\n{xml_block}"
+        return "\n".join(lines)

    def build_pre_activated_prompt(self, skill_names: list[str]) -> str:
        """Build prompt content for pre-activated skills.
@@ -94,7 +94,10 @@ class TestSkillCatalog:
        assert "<name>beta</name>" in prompt
        assert "<description>Alpha skill</description>" in prompt
        assert "<location>/p/alpha/SKILL.md</location>" in prompt
-        assert "<base_dir>/p/alpha</base_dir>" in prompt
+        # <base_dir> is intentionally not emitted — the mandatory header
+        # tells the model to resolve relative paths against the parent of
+        # SKILL.md, so the redundant element was dropped.
+        assert "<base_dir>" not in prompt

    def test_to_prompt_sorted_by_name(self):
        skills = [
@@ -130,13 +133,44 @@ class TestSkillCatalog:
        assert "<name>usr</name>" in prompt
        assert "<name>fw</name>" in prompt

-    def test_to_prompt_contains_behavioral_instruction(self):
+    def test_to_prompt_contains_mandatory_header(self):
+        """The rendered catalog must carry the mandatory pre-reply checklist
+        so soft guidance turns into a required step."""
        catalog = SkillCatalog([_make_skill(source_scope="project")])
        prompt = catalog.to_prompt()

-        assert "When a task matches a skill's description" in prompt
+        assert "## Skills (mandatory)" in prompt
+        assert "Before replying: scan <available_skills>" in prompt
+        assert "never read more than one skill up front" in prompt
+        assert "`read_file`" in prompt
        assert "SKILL.md" in prompt

+    def test_to_prompt_compact_fallback_drops_descriptions(self):
+        """When the full XML body exceeds the char threshold, the compact
+        variant drops <description> but keeps every skill's <name>."""
+        # Each skill contributes ~100+ chars with a long description.
+        # 60 skills easily pushes the body past the threshold.
+        skills = [
+            _make_skill(
+                name=f"skill-{i:03d}",
+                description="A reasonably long description " * 4,
+                location=f"/s/skill-{i:03d}/SKILL.md",
+                base_dir=f"/s/skill-{i:03d}",
+            )
+            for i in range(60)
+        ]
+        catalog = SkillCatalog(skills)
+        prompt = catalog.to_prompt()
+
+        # Mandatory header still present but uses the compact variant wording.
+        assert "## Skills (mandatory)" in prompt
+        assert "scan <available_skills> <name>" in prompt
+        # Every skill's name survives …
+        for i in range(60):
+            assert f"<name>skill-{i:03d}</name>" in prompt
+        # … but no descriptions were rendered.
+        assert "<description>" not in prompt
+
    def test_build_pre_activated_prompt(self):
        skill = _make_skill("research", body="## Deep Research\nDo thorough research.")
        catalog = SkillCatalog([skill])
@@ -1,9 +1,14 @@
 """Tests for AS-6 skill resource loading support.

 Covers:
- <base_dir> element in catalog XML
 - allowlisted_dirs property reflects trusted skill base directories
 - skill_dirs propagation to NodeContext
+
+The catalog XML previously emitted a redundant <base_dir> element next to
+each <location>. That was dropped when the mandatory header took over the
+"resolve relative paths against the parent of SKILL.md" instruction, so
+there is no longer an XML-emission test for base_dir. Programmatic access
+via ``catalog.allowlisted_dirs`` is still covered below.
 """

 from framework.skills.catalog import SkillCatalog
@@ -26,31 +31,6 @@ def _make_skill(


 class TestSkillResourceBaseDir:
-    def test_base_dir_in_xml(self):
-        """Each community skill entry should expose its base_dir in the catalog XML."""
-        skill = _make_skill("deploy", "/project/.hive/skills/deploy")
-        catalog = SkillCatalog([skill])
-        prompt = catalog.to_prompt()
-
-        assert "<base_dir>/project/.hive/skills/deploy</base_dir>" in prompt
-
-    def test_base_dir_xml_escaped(self):
-        """base_dir with XML-special chars should be escaped."""
-        skill = _make_skill("s", "/path/with <&> chars")
-        catalog = SkillCatalog([skill])
-        prompt = catalog.to_prompt()
-
-        assert "<base_dir>/path/with &lt;&amp;&gt; chars</base_dir>" in prompt
-
-    def test_base_dir_present_for_framework_skills(self):
-        """Framework-scope skills now appear in the catalog like any other scope,
-        and their base_dir is included in the XML."""
-        skill = _make_skill("fw", "/hive/_default_skills/fw", source_scope="framework")
-        catalog = SkillCatalog([skill])
-        prompt = catalog.to_prompt()
-        assert "<name>fw</name>" in prompt
-        assert "<base_dir>/hive/_default_skills/fw</base_dir>" in prompt
-
    def test_allowlisted_dirs_matches_skills(self):
        """allowlisted_dirs returns all skill base_dirs including framework ones."""
        skills = [
@@ -0,0 +1,176 @@
+# 🐝 Hive Agent v0.10.0: The Colony
+
+> ⚠️ **Breaking change.** This is a large architectural refactor of how agents work in Hive. **Old agents are no longer compatible.** Existing workspaces, custom agents, and saved sessions from pre-v0.10.0 builds will need to be recreated.
+
+---
+
+## ✨ Highlights
+
+The **Colony** introduces a new way of working: a group of specialized workers operating together to run and scale your business.
+
+The role of the **Queen** has evolved. Instead of only orchestrating, the Queen now **executes work first** to deliver immediate value, then **builds systems around that work** to create stable, repeatable business processes.
+
+You now have a full leadership team of eight Queens, each with their own identity, expertise, and voice:
+
+| Queen | Role |
+| --- | --- |
+| **Sophia** | Head of Brand & Design |
+| **Charlotte** | Head of Finance & Fundraising |
+| **Victoria** | Head of Growth |
+| **Eleanor** | Head of Legal |
+| **Rachel** | Head of Operations |
+| **Isabella** | Head of Product Strategy |
+| **Amelia** | Head of Talent |
+| **Alexandra** | Head of Technology |
+
+Start automating your business processes with your Queens today.
+
+---
+
+## 🏛️ The Colony Architecture
+
+### Queens as Identities, Not Just Orchestrators
+
+- **Queen profiles** — each queen is a YAML-backed persona (`~/.hive/agents/queens/{queen_id}/profile.yaml`) with core traits, hidden background, psychological profile, behavior triggers, and skill sets. Profiles are injected into the system prompt at session start.
+- **CEO-style queen selection** — an LLM classifier routes every new user request to the best-matching queen based on the task at hand, with structured routing diagnostics (`QueenSelection`).
+- **Queen DMs** — direct-message pages for each queen with a dedicated session flow, session switcher, and prompt library integration.
+- **Independent / PM mode** — queens run in an independent mode for planning-phase work, with a "think out loud" internal monologue surfaced through internal tags.
+- **Queen memory v2** — a simplified memory implementation with a reflection agent, cooldown-gated reflections, user identity, doppelganger wiring, and a recall selector for targeted retrieval.
+- **Queen lifecycle tools** — first-class tools for escalation, queen reply, and session handoff.
+
+### Colony Runtime
+
+- **Grand architecture revamp** — the framework, agent loop, runtime, graph, pipeline, executor, and node worker layers have been rewritten from the ground up. Deprecated shims and legacy orchestration paths have been removed.
+- **Colony creation flow** — colonies are created via skill, with reliable event bus subscription, worker spawning, and post-creation list refresh.
+- **Scheduled triggers** — colonies can now be woken on a cron schedule, with triggers firing directly into the owning queen's session.
+- **Simple fork** for agents, stable credential states, and improved worker execution reliability.
+
+---
+
+## 🆕 What's New
+
+### Colony & Queens
+
+- 8 default queen personas (Alexandra, Victoria, Isabella, Charlotte, Eleanor, Sophia, Amelia, Rachel) with profile YAML, examples, and behavior triggers
+- LLM-based queen selector with reasoning output
+- Queen DM page, queen session switcher, and sidebar queen item
+- Queen scope memory, role examples, and identity loading
+- Reflection agent with cooldown and improved reflection runner
+- Queen orchestrator + `routes_queens` API
+- Natural chat replies and cleaner home-prompt bootstrap
+- Queen identity for new sessions
+- `ask_user` / `ask_user_multiple` tools available in queen prompt
+- Escalation and queen-reply tools
+
+### Skills & Tools
+
+- **Learned default skills** — skills the queen has learned become part of her baseline
+- **Tool-gated skill activation** — skills only activate when their required tools are present
+- **Skills for colonies** — per-colony skill registration and loading
+- **Text-only model filter** — image-producing tools and vision-only prompt blocks are hidden from text-only models
+- **Browser skills upgrade** — improved click reliability, screenshot capture, and credential filtering
+- **Deprecated-tool removal** and alignment of Hive tool names across the codebase
+- **Ask-user widget** with fallback rendering and preserved tool pill mapping across turn boundaries for deferred completions
+- **Improved tool-call reliability** across the board (tool limit removed, tool blacklist, tool credential filter)
+- **MCP** — efficient MCP loading at initialization, default MCP bootstrapping, registration of available MCP tools, and fixes to MCP tool initialization and the registry pipeline stage
+
+### LLM & Credentials
+
+- **Key pool** for credential management with stable credential states
+- **Aden credentials storage adapter** and subscription-based LLM config activation endpoint
+- **Consolidated model config** with unified model catalog
+- **New providers** — Kimi, Hive, and Aden added to the model catalog
+- **Model switcher** UI with runtime model switching API
+- **LLM key validation endpoint** with agent errors surfaced via SSE
+- **BYOK modal** import fixes for subscription token detection
+
+### Frontend
+
+- **Home redesign** — new home, credentials, and org chart pages
+- **Colony chat** and **queen DM** pages
+- **Sidebar + header** components and global app layout/routing
+- **Model switcher, settings modal, template card**
+- **Prompt library** with search, category filtering, and UI polish
+- **Side panel** fixes and sub-agent pane light-mode support
+- **Flowchart** light-mode support and normalized settings modal sizing
+- **User profile settings** and UI enhancements
+- **Sync user profile** to global memory as `user-profile.md`; queen profile API transformation
+- Removed the old workspace GUI and its dependencies
+
+### Framework & Runtime
+
+- Architecture revamp: new runtime config, simplified agent loading, new infra for queen
+- Home hive directory structure refactor
+- Agent loading pipeline fixes, MCP registry pipeline stage fix
+- Session resume improvements: separate resume vs new-session flow for queen sessions, edge-case fix for message injection in resumed sessions
+- Strip internal tags from user-visible output
+- Colony event bus subscription fixes and shared event bus for parent visibility
+- Worker spawn and stop-worker fixes
+- Default log level and extra logging hooks
+
+---
+
+## 🐛 Bug Fixes
+
+- **Ask-user widget** — fallback when widget fails to mount
+- **Skill loading** for colonies and proper skill resolution across queen sessions
+- **Model switching** and new-chat flow no longer carry stale state
+- **Tool pill mapping** preserved across turn boundary for deferred `ask_user` completions
+- **Tool limit** removed (was capping legitimate long tool lists)
+- **Queen loading** stability fixes
+- **Side panel** rendering issues
+- **Deprecated graphs** removed from UI
+- **Home-page prompts** now reach the queen directly without waiting for the greeting to finish
+- **Colony creation** link, reframing, and post-creation refresh
+- **Build error** in colony creation path
+- **GCU system prompt** tuning
+- **Tool credential filter** correctness
+- **Screenshot** capture and browser click reliability
+- **Queen message injection** when resuming a session
+- **Internal-tag diction** fixes in surfaced output
+- **MCP tool initialization** on cold start
+- **Frontend DM** edge cases
+- **Prompt library** new-session handling for new chat
+- **Config validation** and unavailable Minimax model handling
+- **Queen identity** loading on cold boot
+- **Extra text** in the queen selector's JSON response is now parsed safely
+- **Outdated queen communication prompt** removed
+
+---
+
+## 🧹 Refactor & Cleanup
+
+- **Shatter the Eld\*n ring** — top-to-bottom refactor of the runtime core
+- **Grand clean-up** of deprecated code paths
+- **Remove deprecated shims** and old session-status tools
+- **Big test cleanup** — integration tests and component tests rewritten around the new architecture
+- **Update references** for orchestrator / host / loader renames
+- **Consolidate tests** for queen state machine and verified outcomes
+- **Remove old workspace GUI** and its dependencies
+- **Remove old "new agent" button** and deprecated entry points
+- **Home hive directory** structure refactor
+
+---
+
+## ⚠️ Breaking Changes
+
+- **Old agents are not compatible.** Custom agents authored against the pre-v0.10.0 framework will need to be re-authored against the new Queen/Colony runtime.
+- **Session format** — pre-v0.10.0 sessions cannot be resumed.
+- **Deprecated tools removed** and Hive tool names have been realigned; any external scripts referencing old tool names must be updated.
+- **Old session-status tools** removed in favor of the new queen lifecycle tools.
+- **Workspace GUI removed** — the legacy workspace UI is gone; use the new home, colony chat, and queen DM pages.
+- **MCP registry pipeline** — MCP configurations now load through the new registry; custom MCP setups may need to be re-registered.
+
+---
+
+## 🚀 Upgrading
+
+Because this release rewrites the agent runtime, the recommended upgrade path is:
+
+1. Back up `~/.hive/` if you have sessions or custom agents you want to reference.
+2. Check out the `v0.10.0` tag (or pull `main` once it includes the v0.10.0 release).
+3. Let Hive initialize the new queen profiles under `~/.hive/agents/queens/`.
+4. Re-create any custom agents as colonies/queens against the new framework.
+5. Re-register any custom MCP servers through the new MCP registry.
+
+Welcome to the Colony. 🐝