Merge branch 'feat/tasks-system'

This commit is contained in:
Richard Tang
2026-04-27 10:55:50 -07:00
7 changed files with 398 additions and 46 deletions
+22 -14
View File
@@ -226,9 +226,9 @@ another pass, kick it off with run_parallel_workers; otherwise stay \
conversational.
If the review itself is multi-step (e.g. "verify each worker's output, \
then draft a summary, then propose next steps"), you may use \
`task_create` / `task_update` to keep yourself organised. Skip them \
for a single-paragraph summary.
then draft a summary, then propose next steps"), lay it out upfront \
with `task_create_batch` and walk through with `task_update`. Skip the \
ceremony for a single-paragraph summary.
"""
@@ -240,10 +240,16 @@ _queen_tools_independent = """
# Tools (INDEPENDENT mode)
## Planning — use FIRST for multi-step work
- task_create / task_update / task_list / task_get When a request \
has 3+ atomic steps, your FIRST tool call is `task_create` (one task \
per step) BEFORE you touch any other tool. See "Independent execution" \
for the per-step flow and granularity rule.
- task_create_batch — When a request has 3+ atomic steps, your FIRST \
tool call is `task_create_batch` with one entry per step (atomic, \
one round-trip). Use this for the upfront plan, NOT five separate \
`task_create` calls.
- task_create — One-off mid-run additions when you discover \
unplanned work AFTER the initial plan is laid out.
- task_update / task_list / task_get — Mark progress, inspect, or \
re-read state.
See "Independent execution" for the per-step flow and granularity rule.
## File I/O (coder-tools MCP)
- read_file, write_file, edit_file, hashline_edit, list_directory, \
@@ -413,19 +419,21 @@ _queen_behavior_independent = """
## Independent execution
You are the agent. **For multi-step work (3+ atomic actions): your FIRST \
tool call is `task_create`** one task per atomic action, before you \
touch any other tool. Then work the list one task at a time:
tool call is `task_create_batch`** with one entry per atomic action, \
before you touch any other tool. (One call, atomic — not N separate \
`task_create` calls.) Then work the list one task at a time:
1. `task_update` in_progress before you start the step.
2. Do one real inline instance — open the browser, call the real API, \
write to the real file. If the action is irreversible or touches \
shared systems, show and confirm before executing. Report concrete \
evidence (actual output, what worked / failed) after the run.
3. `task_update` completed THE MOMENT it's done. **Never batch up \
multiple completions to flush at the end.** `completed` transitions \
are the user's progress heartbeat in the right-rail panel — without \
them, the panel shows a hung spinner no matter how much real work \
you got done.
3. `task_update` completed THE MOMENT it's done. **Do not let \
multiple finished tasks pile up unmarked.** There is no batch update \
tool by design — each `completed` transition is a discrete progress \
heartbeat in the user's right-rail panel. Without those transitions \
the panel shows a hung spinner no matter how much real work you got \
done.
**Granularity: one task per atomic action, not one umbrella per project.** \
Replying to 5 posts is 5 tasks, not 1. Crawling 3 sites is 3 tasks. \
+3 -2
View File
@@ -5,8 +5,9 @@ See temp/tasks-system-implementation-plan.md for the design. Two list types:
colony:{colony_id} -- the queen's spawn-plan template
session:{agent_id}:{sess_id} -- per-session working list
Each agent operates on its own session list via the four task tools
(`task_create`, `task_update`, `task_list`, `task_get`). The colony
Each agent operates on its own session list via the session task tools
(`task_create_batch`, `task_create`, `task_update`, `task_list`,
`task_get`). The colony
template is addressed only by the queen's `colony_template_*` tools and by
the UI/event surface.
"""
+3 -1
View File
@@ -86,7 +86,9 @@ def build_reminder(records: list[TaskRecord]) -> str:
"before starting the next step. Don't batch completions.",
" - If you've finished work that wasn't on the list, add a "
"task_create + task_update completed pair so the panel reflects it.",
" - If you're umbrella-tracking ('reply to all posts' as one task), break it into one task per atomic action.",
" - If you're umbrella-tracking ('reply to all posts' as one task), "
"break it into one task per atomic action — use `task_create_batch` "
"with one entry per action.",
]
if in_progress:
bullets.append(
+85
View File
@@ -164,6 +164,27 @@ class TaskStore:
# ----- task CRUD ----------------------------------------------------
async def create_tasks_batch(
    self,
    task_list_id: str,
    specs: list[dict[str, Any]],
) -> list[TaskRecord]:
    """Atomically create N tasks under a single list-lock acquisition.

    Each spec is a dict with keys: subject (required), description,
    active_form, owner, metadata. Ids are assigned sequentially and
    contiguously — if any task fails to write, an exception is raised
    and the whole batch is rolled back (files unlinked, high-water-mark
    kept at the prior value).

    Atomic-or-none semantics matter for the tool surface: a failed
    partial batch would leave the LLM reasoning about cleanup, which
    defeats the point of batching as a single decision.

    The blocking lock/file work is delegated to a worker thread via
    asyncio.to_thread so the event loop stays responsive.
    """
    return await asyncio.to_thread(
        self._create_tasks_batch_sync, task_list_id, specs
    )
async def create_task(
self,
task_list_id: str,
@@ -432,6 +453,70 @@ class TaskStore:
self._write_highwatermark_sync(task_list_id, new_id)
return record
def _create_tasks_batch_sync(
    self,
    task_list_id: str,
    specs: list[dict[str, Any]],
) -> list[TaskRecord]:
    """Synchronous core of create_tasks_batch.

    Validates every spec, then — under the per-list lock — assigns a
    contiguous id range starting at the list's next id, writes one file
    per task, and bumps the high-water-mark only after the full batch is
    on disk. On any write failure the batch rolls back (all attempted
    files unlinked) and the exception propagates.

    Raises:
        ValueError: if any spec's subject is missing/empty (checked
            before anything is written, so a malformed entry never
            half-creates the batch).
    """
    if not specs:
        return []
    # Validate up-front so we don't half-create on a malformed entry.
    for i, spec in enumerate(specs):
        subj = spec.get("subject")
        if not isinstance(subj, str) or not subj.strip():
            raise ValueError(f"specs[{i}].subject must be a non-empty string")
    with self._list_lock(task_list_id):
        # Same lazy meta backfill as _create_task_sync.
        if not self._meta_path(task_list_id).exists():
            inferred_role = (
                TaskListRole.TEMPLATE
                if task_list_id.startswith("colony:")
                else TaskListRole.SESSION
            )
            self._write_meta_sync(
                task_list_id,
                TaskListMeta(task_list_id=task_list_id, role=inferred_role),
            )
        base_id = self._next_id_sync(task_list_id)
        now = time.time()
        records: list[TaskRecord] = []
        for offset, spec in enumerate(specs):
            rec = TaskRecord(
                id=base_id + offset,
                subject=spec["subject"],
                description=spec.get("description", ""),
                active_form=spec.get("active_form"),
                owner=spec.get("owner"),
                status=TaskStatus.PENDING,
                metadata=dict(spec.get("metadata") or {}),
                created_at=now,
                updated_at=now,
            )
            records.append(rec)
        # Write all task files; on any failure, unlink everything we
        # attempted and re-raise. High-water-mark is bumped only after
        # a successful full-batch write.
        attempted: list[Path] = []
        try:
            for rec in records:
                # Record the path BEFORE writing: a _write_task_sync that
                # fails midway can leave a partial file behind, and that
                # file must be rolled back along with the earlier ones.
                attempted.append(self._task_path(task_list_id, rec.id))
                self._write_task_sync(task_list_id, rec)
        except Exception:
            for path in attempted:
                try:
                    path.unlink(missing_ok=True)
                except OSError:
                    logger.warning("Failed to roll back batch task at %s", path, exc_info=True)
            raise
        highest = records[-1].id
        if highest > self._read_highwatermark_sync(task_list_id):
            self._write_highwatermark_sync(task_list_id, highest)
        return records
# ----- update -------------------------------------------------------
def _update_task_sync(
+127
View File
@@ -159,6 +159,133 @@ async def test_task_not_found_is_not_error(
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_create_batch_creates_n_tasks_atomically(
    registry_with_session_tools: ToolRegistry,
) -> None:
    """Happy path: a single batch call lands three sequentially-id'd tasks."""
    registry = registry_with_session_tools
    session_list = "session:agent_a:sess_1"
    ctx_token = _set_ctx(agent_id="agent_a", task_list_id=session_list)
    try:
        specs = [
            {"subject": "step 1", "active_form": "Doing 1"},
            {"subject": "step 2"},
            {"subject": "step 3"},
        ]
        result = await _invoke(registry, "task_create_batch", tasks=specs)
        assert result.is_error is False

        payload = json.loads(result.content)
        assert payload["success"] is True
        assert payload["task_ids"] == [1, 2, 3]
        # Compact summary message — references first id and the range.
        message = payload["message"]
        assert "#1-#3" in message or "#1, #2, #3" in message
        assert "Mark #1 in_progress" in message

        # Sanity: list shows all three.
        listing = json.loads((await _invoke(registry, "task_list")).content)
        assert listing["count"] == 3
    finally:
        ToolRegistry.reset_execution_context(ctx_token)
@pytest.mark.asyncio
async def test_create_batch_rejects_empty(
    registry_with_session_tools: ToolRegistry,
) -> None:
    """An empty `tasks` array is a soft tool error, not an exception."""
    ctx_token = _set_ctx(agent_id="a", task_list_id="session:a:s")
    try:
        outcome = await _invoke(
            registry_with_session_tools, "task_create_batch", tasks=[]
        )
        payload = json.loads(outcome.content)
        assert payload["success"] is False
        assert "non-empty" in payload["error"]
    finally:
        ToolRegistry.reset_execution_context(ctx_token)
@pytest.mark.asyncio
async def test_create_batch_rejects_malformed_spec_atomically(
    registry_with_session_tools: ToolRegistry,
) -> None:
    """A bad subject in the middle of the batch must reject the whole
    batch — not leave partial state on disk."""
    registry = registry_with_session_tools
    ctx_token = _set_ctx(agent_id="a", task_list_id="session:a:s")
    try:
        outcome = await _invoke(
            registry,
            "task_create_batch",
            tasks=[{"subject": "good"}, {"subject": ""}],
        )
        assert json.loads(outcome.content)["success"] is False
        # Confirm zero tasks landed.
        listing = json.loads((await _invoke(registry, "task_list")).content)
        assert listing["count"] == 0
    finally:
        ToolRegistry.reset_execution_context(ctx_token)
@pytest.mark.asyncio
async def test_create_batch_hook_blocks_rolls_back_whole_batch(
    registry_with_session_tools: ToolRegistry,
) -> None:
    """If a task_created hook blocks even one task in the batch, the
    entire batch must roll back."""
    registry = registry_with_session_tools

    # Block on the second task only.
    def selective_blocker(ctx) -> None:
        if ctx.task.subject == "block me":
            raise BlockingHookError("policy")

    # NOTE(review): the hook is registered but never unregistered here —
    # if register_hook is process-global this could leak into later
    # tests; confirm the fixture resets hook state between tests.
    register_hook(HOOK_TASK_CREATED, selective_blocker)
    ctx_token = _set_ctx(agent_id="a", task_list_id="session:a:s")
    try:
        outcome = await _invoke(
            registry,
            "task_create_batch",
            tasks=[
                {"subject": "ok 1"},
                {"subject": "block me"},
                {"subject": "ok 3"},
            ],
        )
        payload = json.loads(outcome.content)
        assert payload["success"] is False
        assert "rolled back" in payload["error"]
        # All three rolled back.
        listing = json.loads((await _invoke(registry, "task_list")).content)
        assert listing["count"] == 0
    finally:
        ToolRegistry.reset_execution_context(ctx_token)
@pytest.mark.asyncio
async def test_create_batch_then_single_create_keeps_id_monotonic(
    registry_with_session_tools: ToolRegistry,
) -> None:
    """task_create_batch uses sequential ids; a follow-up task_create
    should pick up at the next id after the batch's highest."""
    registry = registry_with_session_tools
    ctx_token = _set_ctx(agent_id="a", task_list_id="session:a:s")
    try:
        batch = [{"subject": "a"}, {"subject": "b"}, {"subject": "c"}]
        await _invoke(registry, "task_create_batch", tasks=batch)
        follow_up = await _invoke(registry, "task_create", subject="d")
        assert json.loads(follow_up.content)["task_id"] == 4
    finally:
        ToolRegistry.reset_execution_context(ctx_token)
@pytest.mark.asyncio
async def test_completion_suffix_points_to_next_pending(
registry_with_session_tools: ToolRegistry,
+139 -13
View File
@@ -117,31 +117,68 @@ def _get_schema() -> dict[str, Any]:
}
def _create_batch_schema() -> dict[str, Any]:
return {
"type": "object",
"properties": {
"tasks": {
"type": "array",
"minItems": 1,
"description": (
"Array of task specs. Each becomes one task with a "
"sequential id. Atomic — all created or none."
),
"items": {
"type": "object",
"properties": {
"subject": {
"type": "string",
"description": "Imperative title (e.g. 'Crawl target URL').",
},
"description": {"type": "string"},
"active_form": {
"type": "string",
"description": (
"Present-continuous label shown while in_progress."
),
},
"metadata": {"type": "object"},
},
"required": ["subject"],
},
}
},
"required": ["tasks"],
}
# ---------------------------------------------------------------------------
# Tool descriptions
# ---------------------------------------------------------------------------
_CREATE_DESC = (
"Create a task on your own session task list to break down and track "
"multi-step work. Use when you have 3+ distinct steps, non-trivial "
"planning, or the user explicitly asks for tracked progress. Capture "
"tasks IMMEDIATELY after receiving instructions — don't narrate intent. "
"DO NOT use this for: a single trivial task, purely conversational "
"replies, greetings, or work that fits in one tool call. The user "
"sees this list live in the right rail.\n\n"
"Create ONE task on your own session task list. Use this for one-off "
"mid-run additions when you discover unplanned work after the initial "
"plan is laid out.\n\n"
"**For laying out a multi-step plan upfront, use `task_create_batch` "
"instead** — one tool call with all the steps is cheaper and atomic.\n\n"
"Fields:\n"
"- subject: short imperative title (e.g. 'Crawl target URLs').\n"
"- subject: short imperative title (e.g. 'Crawl target URL').\n"
"- description: optional, slightly longer 'what to do' note.\n"
"- active_form: present-continuous label shown while in_progress (e.g. "
"'Crawling target URLs'). If omitted, the spinner shows the subject.\n"
"'Crawling target URL'). If omitted, the spinner shows the subject.\n"
"- metadata: optional KV. Set _internal=true to hide from task_list."
)
_UPDATE_DESC = (
"Update a task on your own session task list. Workflow:\n"
"Update ONE task on your own session task list. There is no batch "
"update tool by design — every `completed` transition is a discrete "
"progress signal to the user.\n\n"
"Workflow:\n"
"- Mark a task `in_progress` BEFORE you start working on it.\n"
"- Mark it `completed` AS SOON as you finish it — never batch up "
" multiple completions to flush at the end.\n"
"- Mark it `completed` AS SOON as you finish it — do not let "
"multiple finished tasks pile up unmarked before flushing them at "
"the end of the run.\n"
"- Set status='deleted' to drop a task that's no longer relevant.\n\n"
"ONLY mark `completed` when the task is FULLY done. If you hit errors, "
"blockers, or partial state, keep it `in_progress` and create a new "
@@ -163,6 +200,15 @@ _GET_DESC = (
"task before updating it if you're not sure of current fields."
)
# Tool description for task_create_batch. Steers the model toward ONE
# batched call for the upfront plan and reserves single task_create for
# mid-run discoveries. This text is part of the prompt surface the LLM
# sees — edit wording deliberately.
_CREATE_BATCH_DESC = (
    "Create N tasks at once on your own session task list. **Use this "
    "FIRST when laying out a multi-step plan upfront** — replying to 5 "
    "posts is one `task_create_batch` with 5 entries, not 5 separate "
    "`task_create` calls. Atomic: all-or-none. Use single `task_create` "
    "for one-off mid-run additions when you discover unplanned work, "
    "not for the initial plan."
)
# ---------------------------------------------------------------------------
# Executors
@@ -227,6 +273,77 @@ def _make_create_executor(store: TaskStore):
return execute
def _make_create_batch_executor(store: TaskStore):
    """Build the async executor backing the `task_create_batch` tool.

    Semantics are atomic all-or-none: storage-level validation failures
    and hook blocks both surface as soft errors, and a hook block
    deletes every record the batch just wrote.
    """

    async def execute(inputs: dict) -> dict[str, Any]:
        list_id = _resolve_list_id()
        if not list_id:
            return {"success": False, "error": "No task_list_id resolved for this agent."}
        agent_id = current_agent_id() or ""

        raw_specs = inputs.get("tasks") or []
        if not isinstance(raw_specs, list) or not raw_specs:
            return {
                "success": False,
                "error": "task_create_batch requires a non-empty `tasks` array.",
            }

        # Storage layer validates subject; surface its error as a soft
        # tool_result so sibling tools don't cancel.
        try:
            created = await store.create_tasks_batch(list_id, raw_specs)
        except ValueError as exc:
            return {"success": False, "error": str(exc)}

        # Run task_created hooks per task; blocking on any aborts the
        # whole batch (delete every record we just wrote, return error).
        blocked: tuple[TaskRecord, BlockingHookError] | None = None
        for rec in created:
            try:
                await run_task_hooks(
                    HOOK_TASK_CREATED,
                    task_list_id=list_id,
                    task=rec,
                    agent_id=agent_id,
                )
            except BlockingHookError as exc:
                blocked = (rec, exc)
                break
        if blocked is not None:
            bad_rec, hook_exc = blocked
            logger.warning(
                "task_created hook blocked batch on task #%s: %s",
                bad_rec.id,
                hook_exc,
            )
            for r in created:
                await store.delete_task(list_id, r.id)
            return {
                "success": False,
                "error": (
                    f"Hook blocked task #{bad_rec.id} ({bad_rec.subject!r}); "
                    f"entire batch rolled back: {hook_exc}"
                ),
            }

        for rec in created:
            await emit_task_created(task_list_id=list_id, record=rec)

        new_ids = [r.id for r in created]
        first, last = new_ids[0], new_ids[-1]
        # Compact summary message — don't flood the conversation with
        # one line per created task.
        if len(new_ids) == 1:
            range_label = f"#{first}"
        elif new_ids == list(range(first, last + 1)):
            range_label = f"#{first}-#{last}"
        else:
            range_label = ", ".join(f"#{i}" for i in new_ids)
        return {
            "success": True,
            "task_list_id": list_id,
            "task_ids": new_ids,
            "message": (
                f"Created {len(new_ids)} task(s): {range_label}. "
                f"Mark #{first} in_progress before starting it."
            ),
            "tasks": [_serialize_task(r) for r in created],
        }

    return execute
def _make_update_executor(store: TaskStore):
async def execute(inputs: dict) -> dict[str, Any]:
list_id = _resolve_list_id()
@@ -426,9 +543,18 @@ class _OwnerSentinel: # noqa: N801 — internal sentinel class
def build_session_tools(
store: TaskStore | None = None,
) -> list[tuple[Tool, Any]]:
"""Build (Tool, executor) pairs for the four session task tools."""
"""Build (Tool, executor) pairs for the session task tools."""
s = store or get_task_store()
return [
(
Tool(
name="task_create_batch",
description=_CREATE_BATCH_DESC,
parameters=_create_batch_schema(),
concurrency_safe=False,
),
_make_create_batch_executor(s),
),
(
Tool(
name="task_create",
+19 -16
View File
@@ -108,6 +108,25 @@ function TaskListPanelInner({ title, variant = "rail", onClose }: TaskListPanelP
</p>
) : (
<>
{/* Completed sits above Active so finished tasks stay visually
* "above" the work that came after them — preserves the order
* the user originally saw before the status flipped. */}
<Section
label="Completed"
count={buckets.completed.length}
open={completedOpen}
onToggle={() => setCompletedOpen((v) => !v)}
>
{buckets.completed.map((t) => (
<RefItem
key={t.id}
task={t}
itemRefs={itemRefs}
unresolved={[]}
onJumpToBlocker={handleJumpToBlocker}
/>
))}
</Section>
<Section
label="Active"
count={buckets.active.length}
@@ -140,22 +159,6 @@ function TaskListPanelInner({ title, variant = "rail", onClose }: TaskListPanelP
/>
))}
</Section>
<Section
label="Completed"
count={buckets.completed.length}
open={completedOpen}
onToggle={() => setCompletedOpen((v) => !v)}
>
{buckets.completed.map((t) => (
<RefItem
key={t.id}
task={t}
itemRefs={itemRefs}
unresolved={[]}
onJumpToBlocker={handleJumpToBlocker}
/>
))}
</Section>
</>
)}
</div>