fix: incubating mode approval guidance injection

This commit is contained in:
Richard Tang
2026-04-29 18:43:26 -07:00
parent 6357597e88
commit d26e7f33d2
3 changed files with 39 additions and 79 deletions
@@ -1,9 +1,8 @@
"""One-shot LLM gate that decides if a queen DM is ready to fork a colony. """One-shot LLM gate that decides if a queen DM is ready to fork a colony.
The queen's ``start_incubating_colony`` tool calls :func:`evaluate` with The queen's ``start_incubating_colony`` tool calls :func:`evaluate` with
the queen's recent conversation, a proposed ``colony_name``, and a the queen's recent conversation and a proposed ``colony_name``. The
one-paragraph ``intended_purpose``. The evaluator returns a structured evaluator returns a structured verdict:
verdict:
{ {
"ready": bool, "ready": bool,
@@ -38,8 +37,8 @@ You gate whether a queen agent should commit to forking a persistent
expensive: it ends the user's chat with this queen and the worker runs expensive: it ends the user's chat with this queen and the worker runs
unattended afterward, so the spec must be settled before you approve. unattended afterward, so the spec must be settled before you approve.
Read the conversation excerpt and the queen's proposed colony_name + Read the conversation excerpt and the queen's proposed colony_name,
intended_purpose, then decide. then decide.
APPROVE (ready=true) only when ALL of the following hold: APPROVE (ready=true) only when ALL of the following hold:
1. The user has explicitly asked for work that needs to outlive this 1. The user has explicitly asked for work that needs to outlive this
@@ -128,11 +127,9 @@ def format_conversation_excerpt(messages: list[Message]) -> str:
def _build_user_message( def _build_user_message(
conversation_excerpt: str, conversation_excerpt: str,
colony_name: str, colony_name: str,
intended_purpose: str,
) -> str: ) -> str:
return ( return (
f"## Proposed colony name\n{colony_name}\n\n" f"## Proposed colony name\n{colony_name}\n\n"
f"## Queen's intended_purpose\n{intended_purpose.strip()}\n\n"
f"## Recent conversation (oldest → newest)\n{conversation_excerpt}\n\n" f"## Recent conversation (oldest → newest)\n{conversation_excerpt}\n\n"
"Decide: should this queen be approved to enter INCUBATING phase?" "Decide: should this queen be approved to enter INCUBATING phase?"
) )
@@ -189,7 +186,6 @@ async def evaluate(
llm: Any, llm: Any,
messages: list[Message], messages: list[Message],
colony_name: str, colony_name: str,
intended_purpose: str,
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Run the incubating evaluator against the queen's conversation. """Run the incubating evaluator against the queen's conversation.
@@ -200,14 +196,13 @@ async def evaluate(
messages: The queen's conversation messages, oldest first. The messages: The queen's conversation messages, oldest first. The
evaluator slices its own tail; pass the full list. evaluator slices its own tail; pass the full list.
colony_name: Validated colony slug. colony_name: Validated colony slug.
intended_purpose: Queen's one-paragraph brief.
Returns: Returns:
``{"ready": bool, "reasons": [str], "missing_prerequisites": [str]}``. ``{"ready": bool, "reasons": [str], "missing_prerequisites": [str]}``.
Fail-closed on any error. Fail-closed on any error.
""" """
excerpt = format_conversation_excerpt(messages) excerpt = format_conversation_excerpt(messages)
user_msg = _build_user_message(excerpt, colony_name, intended_purpose) user_msg = _build_user_message(excerpt, colony_name)
try: try:
response = await llm.acomplete( response = await llm.acomplete(
+14 -15
View File
@@ -144,13 +144,12 @@ several entries when you have multiple clarifications. \
When the user clearly wants persistent / recurring / headless work that \ When the user clearly wants persistent / recurring / headless work that \
needs to outlive THIS chat (e.g. "every morning", "monitor X and alert \ needs to outlive THIS chat (e.g. "every morning", "monitor X and alert \
me", "set up a job that"), call ``start_incubating_colony`` with a \ me", "set up a job that"), call ``start_incubating_colony`` with a \
proposed colony_name and a one-paragraph intended_purpose. A side \ proposed colony_name. A side evaluator reads the conversation and \
evaluator reads the conversation and decides if the spec is settled. If \ decides if the spec is settled. If it returns ``not_ready`` you keep \
it returns ``not_ready`` you keep talking with the user sort out \ talking with the user sort out whatever the evaluator said is \
whatever the evaluator said is missing, then retry. If it returns \ missing, then retry. If it returns ``incubating`` your phase flips and \
``incubating`` your phase flips and a new prompt takes over. Do not \ a new prompt takes over. Do not try to write SKILL.md, fork \
try to write SKILL.md, fork directories, or otherwise build the colony \ directories, or otherwise build the colony yourself in this phase.\
yourself in this phase.\
""" """
_queen_role_incubating = """\ _queen_role_incubating = """\
@@ -260,14 +259,14 @@ search_files, run_command, undo_changes
- MUST Follow the browser-automation skill protocol before using browser tools. - MUST Follow the browser-automation skill protocol before using browser tools.
## Hand off to a colony ## Hand off to a colony
- start_incubating_colony(colony_name, intended_purpose) Use this when \ - start_incubating_colony(colony_name) Use this when the user wants \
the user wants persistent / recurring / headless work that needs to \ persistent / recurring / headless work that needs to outlive THIS \
outlive THIS chat. It does NOT fork on its own; it spawns a one-shot \ chat. It does NOT fork on its own; it spawns a one-shot evaluator \
evaluator that reads this conversation and decides whether the spec \ that reads this conversation and decides whether the spec is settled \
is settled enough to proceed. On approval your phase flips to \ enough to proceed. On approval your phase flips to INCUBATING and a \
INCUBATING and a new tool surface (including create_colony itself) \ new tool surface (including create_colony itself) unlocks. On \
unlocks. On rejection you stay here and keep the conversation going \ rejection you stay here and keep the conversation going to fill the \
to fill the gaps the evaluator named. gaps the evaluator named.
""" """
_queen_tools_incubating = """ _queen_tools_incubating = """
+20 -54
View File
@@ -69,15 +69,17 @@ logger = logging.getLogger(__name__)
# Phrasing intentionally invites the queen's judgement; do NOT turn this # Phrasing intentionally invites the queen's judgement; do NOT turn this
# into a hard checklist. # into a hard checklist.
_INCUBATING_APPROVAL_GUIDANCE = ( _INCUBATING_APPROVAL_GUIDANCE = (
"Approved to incubate colony '{colony_name}' for: {intended_purpose}\n\n" "Approved to incubate colony '{colony_name}'.\n\n"
"Your phase has flipped to INCUBATING. Before you call create_colony, " "Your phase has flipped to INCUBATING. Before you call create_colony, "
"the worker will need operational details that are easy to lose in a " "you'll need operational details that are easy to lose in a "
"planning conversation. Take a moment to figure out what's still " "planning conversation. Take a moment to figure out what's still "
"ambiguous for THIS colony — for example: how many tasks should run " "ambiguous for THIS colony — for example: how many worker processes "
"in parallel, what schedule fits (cron, interval, manual-only), what " "should run in parallel (e.g. 1 for a digest, 5 for a fan-out), what "
"should the worker write into progress.db so the user can review " "schedule fits (cron, interval), what should the worker write into "
"results later, how to handle partial failures, what credentials or " "progress tracking(progress.db) so the user "
"MCP servers the worker needs that you haven't discussed. You don't " "can review results later, how to handle partial failures, what "
"credentials or MCP servers the worker needs that you haven't "
"discussed. You don't "
"need to cover every example — only the items that actually matter " "need to cover every example — only the items that actually matter "
"for this colony, and only the ones the user hasn't already implied. " "for this colony, and only the ones the user hasn't already implied. "
"Use ask_user (batch several questions into one call when you have " "Use ask_user (batch several questions into one call when you have "
@@ -151,13 +153,11 @@ class QueenPhaseState:
prompt_working: str = "" prompt_working: str = ""
prompt_reviewing: str = "" prompt_reviewing: str = ""
# Last-set incubation context (colony_name + intended_purpose), populated # Last-set incubation context, populated by start_incubating_colony when
# by start_incubating_colony when the evaluator approves. Read by # the evaluator approves. Read by get_current_prompt() to interpolate the
# get_current_prompt() to interpolate the colony name into the # colony name into the incubating role prompt so the queen sees the same
# incubating role prompt so the queen sees the same name across turns # name across turns without having to remember it from the tool result.
# without having to remember it from the tool result.
incubating_colony_name: str | None = None incubating_colony_name: str | None = None
incubating_intended_purpose: str | None = None
# Default skill operational protocols — appended to every phase prompt # Default skill operational protocols — appended to every phase prompt
protocols_prompt: str = "" protocols_prompt: str = ""
@@ -421,7 +421,6 @@ class QueenPhaseState:
self.phase = "independent" self.phase = "independent"
# Clear stale incubation context so a future incubation starts fresh. # Clear stale incubation context so a future incubation starts fresh.
self.incubating_colony_name = None self.incubating_colony_name = None
self.incubating_intended_purpose = None
tool_names = [t.name for t in self.independent_tools] tool_names = [t.name for t in self.independent_tools]
logger.info("Queen phase → independent (source=%s, tools: %s)", source, tool_names) logger.info("Queen phase → independent (source=%s, tools: %s)", source, tool_names)
await self._emit_phase_event() await self._emit_phase_event()
@@ -436,30 +435,25 @@ class QueenPhaseState:
self, self,
*, *,
colony_name: str, colony_name: str,
intended_purpose: str,
source: str = "tool", source: str = "tool",
) -> None: ) -> None:
"""Switch to incubating phase — queen drafts the colony spec. """Switch to incubating phase — queen drafts the colony spec.
Caller must already have validated colony_name. Stores the active Caller must already have validated colony_name. Stores the active
incubation context on self so get_current_prompt() can interpolate colony_name on self so get_current_prompt() can interpolate it on
it on every turn (the queen otherwise loses the colony_name after every turn (the queen otherwise loses the colony_name after the
the first tool result rolls past in the conversation history). first tool result rolls past in the conversation history).
Args: Args:
colony_name: Validated colony slug (lowercase alphanumeric + _). colony_name: Validated colony slug (lowercase alphanumeric + _).
intended_purpose: One-paragraph brief from the queen.
source: "tool", "frontend", or "auto". source: "tool", "frontend", or "auto".
""" """
if self.phase == "incubating": if self.phase == "incubating":
# Allow re-statement of context even when already incubating # Allow re-statement even when already incubating.
# the queen may have refined her intended_purpose mid-flight.
self.incubating_colony_name = colony_name self.incubating_colony_name = colony_name
self.incubating_intended_purpose = intended_purpose
return return
self.phase = "incubating" self.phase = "incubating"
self.incubating_colony_name = colony_name self.incubating_colony_name = colony_name
self.incubating_intended_purpose = intended_purpose
tool_names = [t.name for t in self.incubating_tools] tool_names = [t.name for t in self.incubating_tools]
logger.info( logger.info(
"Queen phase → incubating (source=%s, colony=%s, tools: %s)", "Queen phase → incubating (source=%s, colony=%s, tools: %s)",
@@ -2211,7 +2205,6 @@ def register_queen_lifecycle_tools(
async def start_incubating_colony( async def start_incubating_colony(
*, *,
colony_name: str, colony_name: str,
intended_purpose: str,
) -> str: ) -> str:
"""Gate the queen behind a one-shot readiness evaluator. """Gate the queen behind a one-shot readiness evaluator.
@@ -2233,18 +2226,6 @@ def register_queen_lifecycle_tools(
{"error": ("colony_name must be lowercase alphanumeric with underscores (e.g. 'morning_hn_digest').")} {"error": ("colony_name must be lowercase alphanumeric with underscores (e.g. 'morning_hn_digest').")}
) )
purpose = (intended_purpose or "").strip()
if not purpose:
return json.dumps(
{
"error": (
"intended_purpose is required — describe in one "
"paragraph what the colony will do, on what "
"cadence, and why it must outlive this chat."
)
}
)
phase_state = getattr(session, "phase_state", None) phase_state = getattr(session, "phase_state", None)
if phase_state is None: if phase_state is None:
return json.dumps({"error": "phase_state is not initialised on this session."}) return json.dumps({"error": "phase_state is not initialised on this session."})
@@ -2305,7 +2286,6 @@ def register_queen_lifecycle_tools(
llm=llm, llm=llm,
messages=messages, messages=messages,
colony_name=cn, colony_name=cn,
intended_purpose=purpose,
) )
if not verdict.get("ready"): if not verdict.get("ready"):
@@ -2323,10 +2303,9 @@ def register_queen_lifecycle_tools(
# Approved — flip phase. switch_to_incubating publishes # Approved — flip phase. switch_to_incubating publishes
# QUEEN_PHASE_CHANGED so the frontend badge updates and stores # QUEEN_PHASE_CHANGED so the frontend badge updates and stores
# the colony_name + purpose for the role prompt to interpolate. # the colony_name for the role prompt to interpolate.
await phase_state.switch_to_incubating( await phase_state.switch_to_incubating(
colony_name=cn, colony_name=cn,
intended_purpose=purpose,
source="tool", source="tool",
) )
@@ -2334,11 +2313,7 @@ def register_queen_lifecycle_tools(
{ {
"status": "incubating", "status": "incubating",
"colony_name": cn, "colony_name": cn,
"intended_purpose": purpose, "guidance": _INCUBATING_APPROVAL_GUIDANCE.format(colony_name=cn),
"guidance": _INCUBATING_APPROVAL_GUIDANCE.format(
colony_name=cn,
intended_purpose=purpose,
),
} }
) )
@@ -2378,17 +2353,8 @@ def register_queen_lifecycle_tools(
"'inbox_monitor')." "'inbox_monitor')."
), ),
}, },
"intended_purpose": {
"type": "string",
"description": (
"One-paragraph brief: what the colony will do, "
"on what cadence, why it must outlive this "
"chat. Do NOT write the SKILL.md here — that "
"happens in INCUBATING phase after approval."
),
},
}, },
"required": ["colony_name", "intended_purpose"], "required": ["colony_name"],
}, },
) )
registry.register( registry.register(