feat: multi task creation

2026-04-27 10:35:02 -07:00
parent da361f735d
commit cb1484be85
6 changed files with 379 additions and 30 deletions
@@ -226,9 +226,9 @@ another pass, kick it off with run_parallel_workers; otherwise stay \
 conversational.
 If the review itself is multi-step (e.g. "verify each worker's output, \
-then draft a summary, then propose next steps"), you may use \
+then draft a summary, then propose next steps"), lay it out upfront \
-`task_create` / `task_update` to keep yourself organised. Skip them \
+with `task_create_batch` and walk through with `task_update`. Skip the \
-for a single-paragraph summary.
+ceremony for a single-paragraph summary.
 """
@@ -240,10 +240,16 @@ _queen_tools_independent = """
 # Tools (INDEPENDENT mode)
 ## Planning — use FIRST for multi-step work
- task_create / task_update / task_list / task_get — When a request \
+- task_create_batch — When a request has 3+ atomic steps, your FIRST \
-has 3+ atomic steps, your FIRST tool call is `task_create` (one task \
+tool call is `task_create_batch` with one entry per step (atomic, \
-per step) BEFORE you touch any other tool. See "Independent execution" \
+one round-trip). Use this for the upfront plan, NOT five separate \
-for the per-step flow and granularity rule.
+`task_create` calls.
 - task_create — One-off mid-run additions when you discover \
 unplanned work AFTER the initial plan is laid out.
 - task_update / task_list / task_get — Mark progress, inspect, or \
 re-read state.
 See "Independent execution" for the per-step flow and granularity rule.
 ## File I/O (coder-tools MCP)
 - read_file, write_file, edit_file, hashline_edit, list_directory, \
@@ -413,19 +419,21 @@ _queen_behavior_independent = """
 ## Independent execution
 You are the agent. **For multi-step work (3+ atomic actions): your FIRST \
-tool call is `task_create`** — one task per atomic action, before you \
+tool call is `task_create_batch`** with one entry per atomic action, \
-touch any other tool. Then work the list one task at a time:
+before you touch any other tool. (One call, atomic — not N separate \
 `task_create` calls.) Then work the list one task at a time:
 1. `task_update` → in_progress before you start the step.
 2. Do one real inline instance — open the browser, call the real API, \
 write to the real file. If the action is irreversible or touches \
 shared systems, show and confirm before executing. Report concrete \
 evidence (actual output, what worked / failed) after the run.
-3. `task_update` → completed THE MOMENT it's done. **Never batch up \
+3. `task_update` → completed THE MOMENT it's done. **Do not let \
-multiple completions to flush at the end.** `completed` transitions \
+multiple finished tasks pile up unmarked.** There is no batch update \
-are the user's progress heartbeat in the right-rail panel — without \
+tool by design — each `completed` transition is a discrete progress \
-them, the panel shows a hung spinner no matter how much real work \
+heartbeat in the user's right-rail panel. Without those transitions \
-you got done.
+the panel shows a hung spinner no matter how much real work you got \
 done.
 **Granularity: one task per atomic action, not one umbrella per project.** \
 Replying to 5 posts is 5 tasks, not 1. Crawling 3 sites is 3 tasks. \
@@ -5,8 +5,9 @@ See temp/tasks-system-implementation-plan.md for the design. Two list types:
    colony:{colony_id}            -- the queen's spawn-plan template
    session:{agent_id}:{sess_id}  -- per-session working list
-Each agent operates on its own session list via the four task tools
+Each agent operates on its own session list via the session task tools
-(`task_create`, `task_update`, `task_list`, `task_get`). The colony
+(`task_create_batch`, `task_create`, `task_update`, `task_list`,
 `task_get`). The colony
 template is addressed only by the queen's `colony_template_*` tools and by
 the UI/event surface.
 """
@@ -86,7 +86,9 @@ def build_reminder(records: list[TaskRecord]) -> str:
        "before starting the next step. Don't batch completions.",
        "  - If you've finished work that wasn't on the list, add a "
        "task_create + task_update completed pair so the panel reflects it.",
-        "  - If you're umbrella-tracking ('reply to all posts' as one task), break it into one task per atomic action.",
+        "  - If you're umbrella-tracking ('reply to all posts' as one task), "
        "break it into one task per atomic action — use `task_create_batch` "
        "with one entry per action.",
    ]
    if in_progress:
        bullets.append(
@@ -164,6 +164,27 @@ class TaskStore:
    # ----- task CRUD ----------------------------------------------------
    async def create_tasks_batch(
        self,
        task_list_id: str,
        specs: list[dict[str, Any]],
    ) -> list[TaskRecord]:
        """Create every task in ``specs`` as one all-or-none operation.

        This is the async entry point; the work itself is done by
        ``_create_tasks_batch_sync`` on a worker thread, which holds the
        list lock for the whole batch.

        Args:
            task_list_id: The list the tasks are created in.
            specs: One dict per task. ``subject`` is required; the optional
                keys are ``description``, ``active_form``, ``owner`` and
                ``metadata``.

        Returns:
            The created records, in the same order as ``specs``, with
            sequential, contiguous ids.

        Raises:
            ValueError: If a spec fails validation. Any exception raised
                while writing is propagated after the files written so far
                have been removed, leaving the high-water-mark untouched.

        All-or-none matters for the tool surface: a half-created batch
        would force the caller to reason about cleanup, which defeats the
        purpose of batching as a single decision.
        """
        # The sync implementation does blocking file work under the list
        # lock, so keep it off the event loop.
        created = await asyncio.to_thread(
            self._create_tasks_batch_sync,
            task_list_id,
            specs,
        )
        return created
    async def create_task(
        self,
        task_list_id: str,
@@ -432,6 +453,70 @@ class TaskStore:
                self._write_highwatermark_sync(task_list_id, new_id)
            return record
    def _create_tasks_batch_sync(
        self,
        task_list_id: str,
        specs: list[dict[str, Any]],
    ) -> list[TaskRecord]:
        if not specs:
            return []
        # Validate up-front so we don't half-create on a malformed entry.
        for i, spec in enumerate(specs):
            subj = spec.get("subject")
            if not isinstance(subj, str) or not subj.strip():
                raise ValueError(f"specs[{i}].subject must be a non-empty string")
        with self._list_lock(task_list_id):
            # Same lazy meta backfill as _create_task_sync.
            if not self._meta_path(task_list_id).exists():
                inferred_role = (
                    TaskListRole.TEMPLATE
                    if task_list_id.startswith("colony:")
                    else TaskListRole.SESSION
                )
                self._write_meta_sync(
                    task_list_id,
                    TaskListMeta(task_list_id=task_list_id, role=inferred_role),
                )
            base_id = self._next_id_sync(task_list_id)
            now = time.time()
            records: list[TaskRecord] = []
            for offset, spec in enumerate(specs):
                rec = TaskRecord(
                    id=base_id + offset,
                    subject=spec["subject"],
                    description=spec.get("description", ""),
                    active_form=spec.get("active_form"),
                    owner=spec.get("owner"),
                    status=TaskStatus.PENDING,
                    metadata=dict(spec.get("metadata") or {}),
                    created_at=now,
                    updated_at=now,
                )
                records.append(rec)
            # Write all task files; on any failure, unlink everything we
            # wrote so far and re-raise. High-water-mark is bumped only
            # after a successful full-batch write.
            written: list[Path] = []
            try:
                for rec in records:
                    self._write_task_sync(task_list_id, rec)
                    written.append(self._task_path(task_list_id, rec.id))
            except Exception:
                for path in written:
                    try:
                        path.unlink(missing_ok=True)
                    except OSError:
                        logger.warning("Failed to roll back batch task at %s", path, exc_info=True)
                raise
            highest = records[-1].id
            if highest > self._read_highwatermark_sync(task_list_id):
                self._write_highwatermark_sync(task_list_id, highest)
            return records
    # ----- update -------------------------------------------------------
    def _update_task_sync(
@@ -159,6 +159,133 @@ async def test_task_not_found_is_not_error(
 # ---------------------------------------------------------------------------
@pytest.mark.asyncio
 async def test_create_batch_creates_n_tasks_atomically(
    registry_with_session_tools: ToolRegistry,
 ) -> None:
    """Happy path: a three-entry batch yields ids 1-3, a compact summary
    message, and three tasks visible through `task_list`."""
    reg = registry_with_session_tools
    list_id = "session:agent_a:sess_1"
    token = _set_ctx(agent_id="agent_a", task_list_id=list_id)
    try:
        # Only the first entry carries an optional field; `subject` alone
        # is enough for the others.
        result = await _invoke(
            reg,
            "task_create_batch",
            tasks=[
                {"subject": "step 1", "active_form": "Doing 1"},
                {"subject": "step 2"},
                {"subject": "step 3"},
            ],
        )
        assert result.is_error is False
        body = json.loads(result.content)
        assert body["success"] is True
        assert body["task_ids"] == [1, 2, 3]
        # Compact summary message — references first id and the range.
        # Either the range form or the comma-separated form is accepted.
        assert "#1-#3" in body["message"] or "#1, #2, #3" in body["message"]
        assert "Mark #1 in_progress" in body["message"]
        # Sanity: list shows all three.
        body2 = json.loads((await _invoke(reg, "task_list")).content)
        assert body2["count"] == 3
    finally:
        # Always restore the execution context, even when an assert fails.
        ToolRegistry.reset_execution_context(token)
@pytest.mark.asyncio
 async def test_create_batch_rejects_empty(
    registry_with_session_tools: ToolRegistry,
 ) -> None:
    """An empty `tasks` array is answered with a soft error whose text
    mentions "non-empty"."""
    reg = registry_with_session_tools
    token = _set_ctx(agent_id="a", task_list_id="session:a:s")
    try:
        result = await _invoke(reg, "task_create_batch", tasks=[])
        body = json.loads(result.content)
        # The failure is reported in the JSON body (success=False).
        assert body["success"] is False
        assert "non-empty" in body["error"]
    finally:
        # Always restore the execution context, even when an assert fails.
        ToolRegistry.reset_execution_context(token)
@pytest.mark.asyncio
 async def test_create_batch_rejects_malformed_spec_atomically(
    registry_with_session_tools: ToolRegistry,
 ) -> None:
    """A bad subject in the middle of the batch must reject the whole
    batch — not leave partial state on disk."""
    reg = registry_with_session_tools
    token = _set_ctx(agent_id="a", task_list_id="session:a:s")
    try:
        # First entry is valid, second has an empty subject: the valid
        # one must not be created either.
        result = await _invoke(
            reg,
            "task_create_batch",
            tasks=[{"subject": "good"}, {"subject": ""}],
        )
        body = json.loads(result.content)
        assert body["success"] is False
        # Confirm zero tasks landed.
        body2 = json.loads((await _invoke(reg, "task_list")).content)
        assert body2["count"] == 0
    finally:
        # Always restore the execution context, even when an assert fails.
        ToolRegistry.reset_execution_context(token)
@pytest.mark.asyncio
 async def test_create_batch_hook_blocks_rolls_back_whole_batch(
    registry_with_session_tools: ToolRegistry,
 ) -> None:
    """If a task_created hook blocks even one task in the batch, the
    entire batch must roll back."""
    reg = registry_with_session_tools
    # Block on the second task only.
    def selective_blocker(ctx) -> None:
        if ctx.task.subject == "block me":
            raise BlockingHookError("policy")
    # NOTE(review): this hook is never unregistered inside the test —
    # confirm a fixture resets hook state, otherwise it leaks into later
    # tests.
    register_hook(HOOK_TASK_CREATED, selective_blocker)
    token = _set_ctx(agent_id="a", task_list_id="session:a:s")
    try:
        result = await _invoke(
            reg,
            "task_create_batch",
            tasks=[
                {"subject": "ok 1"},
                {"subject": "block me"},
                {"subject": "ok 3"},
            ],
        )
        body = json.loads(result.content)
        assert body["success"] is False
        assert "rolled back" in body["error"]
        # All three rolled back.
        body2 = json.loads((await _invoke(reg, "task_list")).content)
        assert body2["count"] == 0
    finally:
        # Always restore the execution context, even when an assert fails.
        ToolRegistry.reset_execution_context(token)
@pytest.mark.asyncio
 async def test_create_batch_then_single_create_keeps_id_monotonic(
    registry_with_session_tools: ToolRegistry,
 ) -> None:
    """task_create_batch uses sequential ids; a follow-up task_create
    should pick up at the next id after the batch's highest."""
    reg = registry_with_session_tools
    token = _set_ctx(agent_id="a", task_list_id="session:a:s")
    try:
        # The batch result itself is not inspected here; only the id
        # handed out to the follow-up single create is checked.
        await _invoke(
            reg,
            "task_create_batch",
            tasks=[{"subject": "a"}, {"subject": "b"}, {"subject": "c"}],
        )
        result = await _invoke(reg, "task_create", subject="d")
        body = json.loads(result.content)
        # Three batch tasks took ids 1-3, so the next id is 4.
        assert body["task_id"] == 4
    finally:
        # Always restore the execution context, even when an assert fails.
        ToolRegistry.reset_execution_context(token)
@pytest.mark.asyncio
 async def test_completion_suffix_points_to_next_pending(
    registry_with_session_tools: ToolRegistry,
@@ -117,31 +117,68 @@ def _get_schema() -> dict[str, Any]:
    }
 def _create_batch_schema() -> dict[str, Any]:
    return {
        "type": "object",
        "properties": {
            "tasks": {
                "type": "array",
                "minItems": 1,
                "description": (
                    "Array of task specs. Each becomes one task with a "
                    "sequential id. Atomic — all created or none."
                ),
                "items": {
                    "type": "object",
                    "properties": {
                        "subject": {
                            "type": "string",
                            "description": "Imperative title (e.g. 'Crawl target URL').",
                        },
                        "description": {"type": "string"},
                        "active_form": {
                            "type": "string",
                            "description": (
                                "Present-continuous label shown while in_progress."
                            ),
                        },
                        "metadata": {"type": "object"},
                    },
                    "required": ["subject"],
                },
            }
        },
        "required": ["tasks"],
    }
 # ---------------------------------------------------------------------------
 # Tool descriptions
 # ---------------------------------------------------------------------------
 _CREATE_DESC = (
-    "Create a task on your own session task list to break down and track "
+    "Create ONE task on your own session task list. Use this for one-off "
-    "multi-step work. Use when you have 3+ distinct steps, non-trivial "
+    "mid-run additions when you discover unplanned work after the initial "
-    "planning, or the user explicitly asks for tracked progress. Capture "
+    "plan is laid out.\n\n"
-    "tasks IMMEDIATELY after receiving instructions — don't narrate intent. "
+    "**For laying out a multi-step plan upfront, use `task_create_batch` "
-    "DO NOT use this for: a single trivial task, purely conversational "
+    "instead** — one tool call with all the steps is cheaper and atomic.\n\n"
    "Skip it for purely conversational replies, greetings, or work that fits in one tool call. The user "
    "sees this list live in the right rail.\n\n"
    "Fields:\n"
-    "- subject: short imperative title (e.g. 'Crawl target URLs').\n"
+    "- subject: short imperative title (e.g. 'Crawl target URL').\n"
    "- description: optional, slightly longer 'what to do' note.\n"
    "- active_form: present-continuous label shown while in_progress (e.g. "
-    "'Crawling target URLs'). If omitted, the spinner shows the subject.\n"
+    "'Crawling target URL'). If omitted, the spinner shows the subject.\n"
    "- metadata: optional KV. Set _internal=true to hide from task_list."
 )
 _UPDATE_DESC = (
-    "Update a task on your own session task list. Workflow:\n"
+    "Update ONE task on your own session task list. There is no batch "
    "update tool by design — every `completed` transition is a discrete "
    "progress signal to the user.\n\n"
    "Workflow:\n"
    "- Mark a task `in_progress` BEFORE you start working on it.\n"
-    "- Mark it `completed` AS SOON as you finish it — never batch up "
+    "- Mark it `completed` AS SOON as you finish it — do not let "
-    "  multiple completions to flush at the end.\n"
+    "multiple finished tasks pile up unmarked before flushing them at "
    "the end of the run.\n"
    "- Set status='deleted' to drop a task that's no longer relevant.\n\n"
    "ONLY mark `completed` when the task is FULLY done. If you hit errors, "
    "blockers, or partial state, keep it `in_progress` and create a new "
@@ -163,6 +200,15 @@ _GET_DESC = (
    "task before updating it if you're not sure of current fields."
 )
 _CREATE_BATCH_DESC = (
    "Create N tasks at once on your own session task list. **Use this "
    "FIRST when laying out a multi-step plan upfront** — replying to 5 "
    "posts is one `task_create_batch` with 5 entries, not 5 separate "
    "`task_create` calls. Atomic: all-or-none. Use single `task_create` "
    "for one-off mid-run additions when you discover unplanned work, "
    "not for the initial plan."
 )
 # ---------------------------------------------------------------------------
 # Executors
@@ -227,6 +273,77 @@ def _make_create_executor(store: TaskStore):
    return execute
 def _make_create_batch_executor(store: TaskStore):
    """Build the async executor behind the `task_create_batch` tool.

    The returned coroutine takes the tool input dict (expects a non-empty
    `tasks` array of spec objects) and returns a result dict. Every
    anticipated failure — no resolved list, empty or malformed `tasks`,
    storage validation error, blocking hook — is reported as
    ``{"success": False, "error": ...}`` rather than raised. On success
    the dict carries `task_list_id`, `task_ids`, a compact `message` and
    the serialized `tasks`.
    """
    async def execute(inputs: dict) -> dict[str, Any]:
        list_id = _resolve_list_id()
        if not list_id:
            return {"success": False, "error": "No task_list_id resolved for this agent."}
        agent_id = current_agent_id() or ""
        specs = inputs.get("tasks") or []
        if not isinstance(specs, list) or not specs:
            return {
                "success": False,
                "error": "task_create_batch requires a non-empty `tasks` array.",
            }
        # Entries come straight from tool input, so one may not be an
        # object at all. Reject that here as a soft error instead of
        # letting it escape as a non-ValueError exception from storage.
        for index, spec in enumerate(specs):
            if not isinstance(spec, dict):
                return {
                    "success": False,
                    "error": f"tasks[{index}] must be an object with a `subject` field.",
                }

        # Storage layer validates subject; surface its error as a soft
        # tool_result so sibling tools don't cancel.
        try:
            recs = await store.create_tasks_batch(list_id, specs)
        except ValueError as exc:
            return {"success": False, "error": str(exc)}

        # Run task_created hooks per task; blocking on any aborts the
        # whole batch (delete every record we just wrote, return error).
        for rec in recs:
            try:
                await run_task_hooks(
                    HOOK_TASK_CREATED,
                    task_list_id=list_id,
                    task=rec,
                    agent_id=agent_id,
                )
            except BlockingHookError as exc:
                logger.warning(
                    "task_created hook blocked batch on task #%s: %s",
                    rec.id,
                    exc,
                )
                # Best-effort rollback: one failed delete must not stop
                # the remaining deletes, or the list is left half
                # populated with no report to the caller.
                undeleted = []
                for r in recs:
                    try:
                        await store.delete_task(list_id, r.id)
                    except Exception:
                        logger.warning(
                            "Failed to roll back batch task #%s",
                            r.id,
                            exc_info=True,
                        )
                        undeleted.append(r.id)
                if undeleted:
                    leftover = ", ".join(f"#{i}" for i in undeleted)
                    return {
                        "success": False,
                        "error": (
                            f"Hook blocked task #{rec.id} ({rec.subject!r}); "
                            f"batch rolled back except {leftover}, which "
                            f"could not be deleted: {exc}"
                        ),
                    }
                return {
                    "success": False,
                    "error": (
                        f"Hook blocked task #{rec.id} ({rec.subject!r}); "
                        f"entire batch rolled back: {exc}"
                    ),
                }

        # Events are emitted only once every hook has passed, so a
        # rolled-back batch emits nothing.
        for rec in recs:
            await emit_task_created(task_list_id=list_id, record=rec)

        ids = [r.id for r in recs]
        # Compact summary message — don't flood the conversation with
        # one line per created task.
        if len(ids) == 1:
            range_label = f"#{ids[0]}"
        elif ids == list(range(ids[0], ids[-1] + 1)):
            range_label = f"#{ids[0]}-#{ids[-1]}"
        else:
            range_label = ", ".join(f"#{i}" for i in ids)
        return {
            "success": True,
            "task_list_id": list_id,
            "task_ids": ids,
            "message": (
                f"Created {len(ids)} task(s): {range_label}. "
                f"Mark #{ids[0]} in_progress before starting it."
            ),
            "tasks": [_serialize_task(r) for r in recs],
        }
    return execute
 def _make_update_executor(store: TaskStore):
    async def execute(inputs: dict) -> dict[str, Any]:
        list_id = _resolve_list_id()
@@ -426,9 +543,18 @@ class _OwnerSentinel:  # noqa: N801 — internal sentinel class
 def build_session_tools(
    store: TaskStore | None = None,
 ) -> list[tuple[Tool, Any]]:
-    """Build (Tool, executor) pairs for the four session task tools."""
+    """Build (Tool, executor) pairs for the session task tools."""
    s = store or get_task_store()
    return [
        (
            Tool(
                name="task_create_batch",
                description=_CREATE_BATCH_DESC,
                parameters=_create_batch_schema(),
                concurrency_safe=False,
            ),
            _make_create_batch_executor(s),
        ),
        (
            Tool(
                name="task_create",