# hive/core/framework/tasks/store.py

"""File-backed task store with filelock-based coordination.

Layout per list::

    {task_list_path}/tasks.json        -- TaskListDocument (meta + hwm + tasks)
    {task_list_path}/tasks.json.lock   -- list-level lock sentinel (colony lists only)

Where ``task_list_path`` is:

    colony:{c}        -> ~/.hive/colonies/{c}/
    session:{a}:{s}   -> ~/.hive/agents/{a}/sessions/{s}/
    unscoped:{a}      -> ~/.hive/unscoped/{a}/
    {malformed}       -> ~/.hive/_misc/{slug}/

An older layout used the same root plus a nested ``tasks/`` subdir holding
``meta.json``, ``.highwatermark``, ``.lock``, and a further ``tasks/`` dir
with one ``NNNN.json`` per task — hence the ugly ``…/tasks/tasks/0001.json``
path. Migration is lazy: the first access that finds such a list folds the
legacy artifacts into ``tasks.json`` under the list lock and removes them.

All filesystem I/O is wrapped in ``asyncio.to_thread`` so the event loop
never blocks. The colony file lock uses a ~3s budget — comfortable headroom
for the only realistic write contender (colony template under concurrent
``colony_template_*`` and ``run_parallel_workers`` stamps). Non-colony lists
are serialised by an in-process ``threading.Lock`` instead.
"""

from __future__ import annotations

import asyncio
import logging
import os
import shutil
import threading
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Any

from filelock import FileLock

from framework.tasks.models import (
    ClaimAlreadyCompleted,
    ClaimAlreadyOwned,
    ClaimBlocked,
    ClaimNotFound,
    ClaimOk,
    ClaimResult,
    TaskListDocument,
    TaskListMeta,
    TaskListRole,
    TaskRecord,
    TaskStatus,
)
from framework.utils.io import atomic_write

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Timeout (seconds) handed to the colony ``FileLock`` in ``TaskStore._list_lock``.
LOCK_TIMEOUT_SECONDS = 3.0  # ~30 retries × ~100ms

# File name of the single per-list document, relative to the list directory.
DOC_FILENAME = "tasks.json"
# File name of the on-disk lock sentinel, relative to the list directory.
LOCK_FILENAME = "tasks.json.lock"  # only colony lists (cross-process writers)

# Per-list in-memory locks for single-process scopes (session/unscoped/_misc).
# Sessions have one owning agent, so only same-process concurrency matters
# (e.g. parallel tool use within a single turn) — no on-disk lock needed.
# Keyed by task_list_id; entries are created on demand by ``_get_inproc_lock``.
_INPROC_LOCKS: dict[str, threading.Lock] = {}
# Guards creation/lookup in ``_INPROC_LOCKS`` itself.
_INPROC_LOCKS_GUARD = threading.Lock()


def _get_inproc_lock(task_list_id: str) -> threading.Lock:
    """Return the process-wide ``threading.Lock`` registered for ``task_list_id``.

    The lock is created on first request and cached in ``_INPROC_LOCKS``.
    The registry is only touched while holding ``_INPROC_LOCKS_GUARD``, so
    concurrent callers asking for the same id receive the same lock object.
    """
    with _INPROC_LOCKS_GUARD:
        try:
            return _INPROC_LOCKS[task_list_id]
        except KeyError:
            fresh = _INPROC_LOCKS[task_list_id] = threading.Lock()
            return fresh


class _Unset:
    """Sentinel for "owner argument not provided" — distinct from owner=None.

    ``owner=None`` is a real value for the update path (it is written to the
    record like any other owner), so a dedicated type is needed to express
    "leave the owner untouched".
    """

    __slots__ = ()


# Shared instance used as the default value of ``owner`` parameters.
_UNSET_SENTINEL: _Unset = _Unset()


def _hive_root() -> Path:
    """Return the hive data directory.

    The ``HIVE_HOME`` environment variable wins when it is set (this is what
    tests use); otherwise the directory is ``~/.hive``.
    """
    fallback = Path.home() / ".hive"
    configured = os.environ.get("HIVE_HOME", str(fallback))
    return Path(configured)


def _find_queen_session_dir(session_id: str, *, hive_root: Path) -> Path | None:
    """Return ``agents/queens/{queen}/sessions/{session_id}`` if one exists.

    Queens live under ``QUEENS_DIR = hive_root / "agents" / "queens"`` (see
    ``framework.config``). The task system gets a generic ``agent_id ==
    "queen"`` in its ``task_list_id``, which would otherwise dead-end at
    ``agents/queen/...``, decoupled from the real session folder. By
    probing the canonical layout we keep the task doc beside conversations,
    events, summary, and meta for the same session.
    """
    queens_dir = hive_root / "agents" / "queens"
    if not queens_dir.exists():
        return None
    try:
        candidates = [d for d in queens_dir.iterdir() if d.is_dir()]
    except OSError:
        return None
    for queen_dir in candidates:
        candidate = queen_dir / "sessions" / session_id
        if candidate.is_dir():
            return candidate
    return None


def task_list_path(task_list_id: str, *, hive_root: Path | None = None) -> Path:
    """Resolve task_list_id -> directory containing ``tasks.json``.

    Note: this returns the *parent* of the doc file, not the file itself.
    For session/colony/unscoped lists, this is the agent or colony's home
    dir; the task doc is one filename inside it. (The older layout had an
    extra ``tasks/`` subdir under this path — see ``_legacy_root``.)

    For ``session:`` lists, the canonical queen session folder is preferred
    when it exists on disk: the task doc lives next to the rest of that
    session's data (conversations, events, summary).
    """
    root = hive_root or _hive_root()
    if task_list_id.startswith("colony:"):
        colony_id = task_list_id[len("colony:") :]
        return root / "colonies" / colony_id
    if task_list_id.startswith("session:"):
        rest = task_list_id[len("session:") :]
        agent_id, _, session_id = rest.partition(":")
        if not session_id:
            raise ValueError(f"Malformed session task_list_id: {task_list_id!r}")
        canonical = _find_queen_session_dir(session_id, hive_root=root)
        if canonical is not None:
            return canonical
        return root / "agents" / agent_id / "sessions" / session_id
    if task_list_id.startswith("unscoped:"):
        agent_id = task_list_id[len("unscoped:") :]
        return root / "unscoped" / agent_id
    # Last-ditch sanitization for HIVE_TASK_LIST_ID overrides — slugify the
    # whole thing so the test/dev path can't escape the hive root.
    safe = "".join(c if c.isalnum() or c in "-_" else "_" for c in task_list_id)
    return root / "_misc" / safe


def _legacy_root(task_list_id: str, *, hive_root: Path | None = None) -> Path:
    """Where the older artifacts (meta.json, .highwatermark, tasks/NNNN.json) lived.

    Pinned to the *pre-canonical* layout — for queen session lists this is
    ``agents/{agent_id}/sessions/{session_id}/tasks`` (i.e. the literal
    ``agent_id`` folder, not the canonical ``agents/queens/{queen}/...``
    path). The lazy migration reads from here and writes the new doc to
    wherever ``task_list_path`` resolves now.
    """
    root = hive_root or _hive_root()
    if task_list_id.startswith("colony:"):
        return root / "colonies" / task_list_id[len("colony:") :] / "tasks"
    if task_list_id.startswith("session:"):
        rest = task_list_id[len("session:") :]
        agent_id, _, session_id = rest.partition(":")
        return root / "agents" / agent_id / "sessions" / session_id / "tasks"
    if task_list_id.startswith("unscoped:"):
        return root / "unscoped" / task_list_id[len("unscoped:") :] / "tasks"
    # _misc fallback: legacy lived directly in the slug dir, same as the new parent.
    safe = "".join(c if c.isalnum() or c in "-_" else "_" for c in task_list_id)
    return root / "_misc" / safe


# ---------------------------------------------------------------------------
# TaskStore — public façade
# ---------------------------------------------------------------------------


class TaskStore:
    """Async wrapper around the on-disk store.

    A single TaskStore is fine to share across the process. Colony lists
    are guarded by an on-disk ``FileLock`` and are therefore safe across
    processes; all other lists are guarded by an in-process lock only.
    """

    def __init__(self, *, hive_root: Path | None = None) -> None:
        """Remember the root data directory to resolve list paths against.

        ``hive_root=None`` defers the decision to ``task_list_path``, which
        falls back to ``_hive_root()`` on every lookup.
        """
        self._hive_root = hive_root

    # ----- list-level ---------------------------------------------------

    async def ensure_task_list(
        self,
        task_list_id: str,
        *,
        role: TaskListRole,
        creator_agent_id: str | None = None,
        session_id: str | None = None,
    ) -> TaskListMeta:
        """Create a list if absent; if present, append session_id to last_seen.

        Idempotent: callers (ColonyRuntime bringup, lazy session creation)
        can call this every time.

        ``role`` and ``creator_agent_id`` are only applied when the list is
        created; for an existing list only ``session_id`` has an effect.
        Returns the list's ``TaskListMeta``. The work runs in a worker
        thread via ``asyncio.to_thread``.
        """
        return await asyncio.to_thread(
            self._ensure_task_list_sync,
            task_list_id,
            role,
            creator_agent_id,
            session_id,
        )

    async def list_exists(self, task_list_id: str) -> bool:
        """Report whether the list is present on disk in either layout.

        True when ``tasks.json`` exists, or failing that when legacy
        artifacts are found — so lists written under the older layout and
        not yet migrated still surface to the REST layer. The check is
        read-only: it does not trigger a migration.
        """

        def _probe() -> bool:
            has_doc = self._doc_path(task_list_id).exists()
            return has_doc or self._has_legacy_artifacts(task_list_id)

        return await asyncio.to_thread(_probe)

    async def get_meta(self, task_list_id: str) -> TaskListMeta | None:
        """Return the list's meta, or ``None`` if no list document could be read."""
        return await asyncio.to_thread(self._read_meta_sync, task_list_id)

    async def reset_task_list(self, task_list_id: str) -> None:
        """Delete all tasks but preserve the high-water-mark.

        Test helper. Never wired to runtime lifecycle.

        The high-water-mark is raised to the largest existing task id first,
        so ids are not reused after the reset. No-op when the list does not
        exist.
        """
        await asyncio.to_thread(self._reset_sync, task_list_id)

    # ----- task CRUD ----------------------------------------------------

    async def create_tasks_batch(
        self,
        task_list_id: str,
        specs: list[dict[str, Any]],
    ) -> list[TaskRecord]:
        """Atomically create N tasks under a single list-lock acquisition.

        Each spec is a dict with keys: subject (required), description,
        active_form, owner, metadata. Ids are assigned sequentially and
        contiguously; if any spec is malformed the whole batch is
        rejected before any write. The doc model makes "atomic-or-none"
        free — we mutate one in-memory document and write it once.

        Returns the created records in spec order; an empty ``specs`` list
        returns ``[]`` without touching the store. Raises ``ValueError`` when
        a spec's ``subject`` is not a non-empty string.
        """
        return await asyncio.to_thread(
            self._create_tasks_batch_sync, task_list_id, specs
        )

    async def create_task(
        self,
        task_list_id: str,
        *,
        subject: str,
        description: str = "",
        active_form: str | None = None,
        owner: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> TaskRecord:
        """Create a single PENDING task and return its record.

        The id is one above the larger of the list's high-water-mark and its
        highest existing task id. If the list does not exist yet it is
        created implicitly, with a role inferred from the id prefix
        (``colony:`` -> template, anything else -> session).
        """
        return await asyncio.to_thread(
            self._create_task_sync,
            task_list_id,
            subject,
            description,
            active_form,
            owner,
            # Normalise a missing metadata argument to a fresh empty dict.
            metadata or {},
        )

    async def get_task(self, task_list_id: str, task_id: int) -> TaskRecord | None:
        """Return the task with ``task_id``, or ``None`` if the list or task is missing."""
        return await asyncio.to_thread(self._read_task_sync, task_list_id, task_id)

    async def list_tasks(
        self,
        task_list_id: str,
        *,
        include_internal: bool = False,
    ) -> list[TaskRecord]:
        """Return the list's tasks ordered by id.

        Tasks whose metadata carries a truthy ``_internal`` entry are hidden
        unless ``include_internal`` is set. A missing list yields ``[]``.
        """
        everything = await asyncio.to_thread(self._list_tasks_sync, task_list_id)
        if not include_internal:
            return [rec for rec in everything if not rec.metadata.get("_internal")]
        return everything

    async def update_task(
        self,
        task_list_id: str,
        task_id: int,
        *,
        subject: str | None = None,
        description: str | None = None,
        active_form: str | None = None,
        owner: str | None | _Unset = _UNSET_SENTINEL,
        status: TaskStatus | None = None,
        add_blocks: list[int] | None = None,
        add_blocked_by: list[int] | None = None,
        metadata_patch: dict[str, Any] | None = None,
    ) -> tuple[TaskRecord | None, list[str]]:
        """Update a task; returns (new_record, fields_changed) or (None, []).

        ``None`` for subject/description/active_form/status means "leave
        unchanged". ``owner`` uses a sentinel default instead, so passing
        ``owner=None`` explicitly clears the owner. ``add_blocks`` and
        ``add_blocked_by`` append ids and also add the reverse link on the
        referenced task when it exists in the list. In ``metadata_patch`` a
        ``None`` value removes the key; any other value sets it.

        ``(None, [])`` is returned when the list or the task does not exist.
        Nothing is written when no field actually changes.
        """
        return await asyncio.to_thread(
            self._update_task_sync,
            task_list_id,
            task_id,
            subject,
            description,
            active_form,
            owner,
            status,
            add_blocks,
            add_blocked_by,
            metadata_patch,
        )

    async def delete_task(self, task_list_id: str, task_id: int) -> tuple[bool, list[int]]:
        """Delete a task; returns (was_deleted, cascaded_ids).

        ``cascaded_ids`` are the ids of other tasks whose blocks/blocked_by
        referenced the deleted id and were stripped.

        ``(False, [])`` is returned when the list or the task does not exist.
        """
        return await asyncio.to_thread(self._delete_task_sync, task_list_id, task_id)

    async def claim_task_with_busy_check(
        self,
        task_list_id: str,
        task_id: int,
        claimant: str,
    ) -> ClaimResult:
        """Atomic claim under list-lock.

        Used internally by ``run_parallel_workers`` when stamping
        ``metadata.assigned_session`` on colony template entries — not
        exposed to LLMs as a worker-facing claim race.

        The result is one of the ``Claim*`` variants: not found, already
        completed, already owned by someone else, blocked by unfinished
        blockers, or ok (with the updated record).

        NOTE(review): the sync implementation sets the task's ``owner`` to
        ``claimant``; it does not itself write ``metadata.assigned_session``
        — presumably the caller stamps that separately. Confirm.
        """
        return await asyncio.to_thread(self._claim_sync, task_list_id, task_id, claimant)

    # =====================================================================
    # Sync internals — all called via asyncio.to_thread
    # =====================================================================

    def _list_dir(self, task_list_id: str) -> Path:
        """Directory holding this list's ``tasks.json`` (see ``task_list_path``)."""
        return task_list_path(task_list_id, hive_root=self._hive_root)

    def _doc_path(self, task_list_id: str) -> Path:
        """Full path of this list's document file (``DOC_FILENAME`` in the list dir)."""
        return self._list_dir(task_list_id) / DOC_FILENAME

    def _list_lock(self, task_list_id: str):
        """Return a context manager that serialises writes to this list.

        Colony template lists get a cross-process ``FileLock`` (bounded by
        ``LOCK_TIMEOUT_SECONDS``) because ``run_parallel_workers`` spawns
        worker subprocesses that stamp completion back onto the template.
        Session/unscoped/_misc lists have a single owning agent, so only
        same-process concurrency matters (e.g. parallel tool use within one
        turn); they get the shared in-memory ``threading.Lock`` for the id,
        which also avoids a visible ``tasks.json.lock`` beside session
        folders.

        Side effect: the list directory is created if missing, for every
        kind of list.
        """
        list_dir = self._list_dir(task_list_id)
        list_dir.mkdir(parents=True, exist_ok=True)
        if not task_list_id.startswith("colony:"):
            return _get_inproc_lock(task_list_id)
        lock_file = list_dir / LOCK_FILENAME
        return FileLock(str(lock_file), timeout=LOCK_TIMEOUT_SECONDS)

    def _legacy_dir(self, task_list_id: str) -> Path:
        """Root directory of the older layout for this list (see ``_legacy_root``)."""
        return _legacy_root(task_list_id, hive_root=self._hive_root)

    def _legacy_meta_path(self, task_list_id: str) -> Path:
        """Path of the older layout's ``meta.json`` for this list."""
        return self._legacy_dir(task_list_id) / "meta.json"

    def _legacy_hwm_path(self, task_list_id: str) -> Path:
        """Path of the older layout's ``.highwatermark`` file for this list."""
        return self._legacy_dir(task_list_id) / ".highwatermark"

    def _legacy_lock_path(self, task_list_id: str) -> Path:
        """Path of the older layout's ``.lock`` file.

        Within this class it is only referenced by the legacy cleanup, which
        deletes it; it is never acquired as a lock here.
        """
        return self._legacy_dir(task_list_id) / ".lock"

    def _legacy_tasks_dir(self, task_list_id: str) -> Path:
        """Directory that held the older layout's per-task ``*.json`` files."""
        return self._legacy_dir(task_list_id) / "tasks"

    def _has_legacy_artifacts(self, task_list_id: str) -> bool:
        """Tell whether anything from the older layout is still on disk.

        True when the legacy ``meta.json`` exists, or when the legacy tasks
        directory contains at least one ``*.json`` entry. A tasks directory
        that cannot be listed counts as "no artifacts".
        """
        if self._legacy_meta_path(task_list_id).exists():
            return True

        tasks_dir = self._legacy_tasks_dir(task_list_id)
        if not tasks_dir.exists():
            return False

        try:
            for entry in tasks_dir.iterdir():
                if entry.suffix == ".json":
                    return True
        except OSError:
            return False
        return False

    # ----- doc IO -------------------------------------------------------

    def _read_doc_sync(self, task_list_id: str) -> TaskListDocument | None:
        """Load the list document, migrating a legacy list on first access.

        The common path is lock-free: if ``tasks.json`` exists and parses,
        it is returned directly. Otherwise, when legacy artifacts are
        present, the list lock is taken, the doc is re-checked (another
        writer may have migrated it meanwhile) and, failing that, the legacy
        artifacts are folded into a new doc, written out and cleaned up.

        Returns ``None`` if the list doesn't exist on disk in either form.
        A doc that fails to read or validate is logged and treated as
        absent, so the legacy fallback gets a chance to rescue it.
        """
        doc_path = self._doc_path(task_list_id)

        def _try_load(warning: str) -> TaskListDocument | None:
            # None means "absent or unreadable"; unreadable docs are logged.
            if not doc_path.exists():
                return None
            try:
                raw = doc_path.read_text(encoding="utf-8")
                return TaskListDocument.model_validate_json(raw)
            except Exception:
                logger.warning(warning, doc_path, exc_info=True)
                return None

        loaded = _try_load("Corrupt tasks.json at %s")
        if loaded is not None:
            return loaded

        if not self._has_legacy_artifacts(task_list_id):
            return None

        with self._list_lock(task_list_id):
            # Re-check under lock: a parallel writer may have just finished
            # migrating, in which case the new doc is what we want.
            loaded = _try_load("Corrupt tasks.json at %s (post-lock)")
            if loaded is not None:
                return loaded

            migrated = self._migrate_legacy_unsafe(task_list_id)
            if migrated is not None:
                self._write_doc_unsafe(task_list_id, migrated)
                self._cleanup_legacy_unsafe(task_list_id)
            return migrated

    def _read_doc_unsafe(self, task_list_id: str) -> TaskListDocument | None:
        """Load the list document; the caller already holds the list lock.

        Counterpart of ``_read_doc_sync`` for code that is already inside
        ``with self._list_lock``: a legacy list is migrated in place without
        re-entering the lock.

        Returns ``None`` when there is neither a readable doc nor anything
        to migrate.

        NOTE(review): an unreadable ``tasks.json`` with no legacy artifacts
        also yields ``None``, which the create paths treat as "new list" —
        confirm that overwriting a corrupt doc is the intended outcome.
        """
        doc_path = self._doc_path(task_list_id)
        if doc_path.exists():
            try:
                raw = doc_path.read_text(encoding="utf-8")
                return TaskListDocument.model_validate_json(raw)
            except Exception:
                logger.warning("Corrupt tasks.json at %s", doc_path, exc_info=True)

        if not self._has_legacy_artifacts(task_list_id):
            return None

        migrated = self._migrate_legacy_unsafe(task_list_id)
        if migrated is None:
            return None
        self._write_doc_unsafe(task_list_id, migrated)
        self._cleanup_legacy_unsafe(task_list_id)
        return migrated

    def _write_doc_unsafe(self, task_list_id: str, doc: TaskListDocument) -> None:
        """Replace the list document on disk. Caller MUST hold the list-lock.

        The parent directory is created if needed and the JSON (2-space
        indent) is written through ``atomic_write``.
        """
        target = self._doc_path(task_list_id)
        target.parent.mkdir(parents=True, exist_ok=True)
        with atomic_write(target) as handle:
            serialized = doc.model_dump_json(indent=2)
            handle.write(serialized)

    # ----- migration ----------------------------------------------------

    def _migrate_legacy_unsafe(self, task_list_id: str) -> TaskListDocument | None:
        """Build a ``TaskListDocument`` from legacy artifacts. Caller MUST hold lock.

        * meta: taken from the legacy ``meta.json``; if that is missing or
          unreadable a minimal meta is synthesised with a role inferred from
          the id prefix (``colony:`` -> template, otherwise session).
        * tasks: every ``*.json`` in the legacy tasks dir that validates,
          ordered by id; corrupt files are logged and skipped.
        * high-water-mark: the larger of the stored legacy value and the
          highest recovered task id.

        Nothing is written or deleted here. Returns ``None`` when there is
        nothing to migrate (no tasks, zero hwm, no legacy ``meta.json``).
        """
        meta = self._read_legacy_meta(task_list_id)
        if meta is None:
            is_colony = task_list_id.startswith("colony:")
            fallback_role = TaskListRole.TEMPLATE if is_colony else TaskListRole.SESSION
            meta = TaskListMeta(task_list_id=task_list_id, role=fallback_role)

        recovered: list[TaskRecord] = []
        legacy_tasks = self._legacy_tasks_dir(task_list_id)
        if legacy_tasks.exists():
            json_files = [
                entry for entry in sorted(legacy_tasks.iterdir()) if entry.suffix == ".json"
            ]
            for task_file in json_files:
                try:
                    raw = task_file.read_text(encoding="utf-8")
                    recovered.append(TaskRecord.model_validate_json(raw))
                except Exception:
                    logger.warning(
                        "Skipping corrupt legacy task file %s during migration",
                        task_file,
                        exc_info=True,
                    )
        recovered.sort(key=lambda rec: rec.id)

        # After the sort the last record carries the highest id.
        highest_id = recovered[-1].id if recovered else 0
        hwm = max(self._read_legacy_hwm(task_list_id), highest_id)

        nothing_to_migrate = (
            not recovered
            and hwm == 0
            and not self._legacy_meta_path(task_list_id).exists()
        )
        if nothing_to_migrate:
            return None

        return TaskListDocument(meta=meta, highwatermark=hwm, tasks=recovered)

    def _read_legacy_meta(self, task_list_id: str) -> TaskListMeta | None:
        """Parse the legacy ``meta.json``.

        Returns ``None`` when the file is absent, or when it cannot be read
        or validated (the failure is logged as a warning).
        """
        meta_file = self._legacy_meta_path(task_list_id)
        if not meta_file.exists():
            return None
        try:
            raw = meta_file.read_text(encoding="utf-8")
            return TaskListMeta.model_validate_json(raw)
        except Exception:
            logger.warning("Corrupt legacy meta.json at %s", meta_file, exc_info=True)
        return None

    def _read_legacy_hwm(self, task_list_id: str) -> int:
        """Read the legacy ``.highwatermark`` value.

        Returns ``0`` when the file is absent, blank, unreadable, or does
        not contain an integer.
        """
        hwm_file = self._legacy_hwm_path(task_list_id)
        if not hwm_file.exists():
            return 0
        try:
            text = hwm_file.read_text(encoding="utf-8").strip()
            return int(text) if text else 0
        except (ValueError, OSError):
            return 0

    def _cleanup_legacy_unsafe(self, task_list_id: str) -> None:
        """Delete what the older layout left behind. Caller MUST hold the list-lock.

        Two cases:

        * The legacy dir is a dedicated ``tasks/`` subdir (session, colony,
          unscoped): the whole tree is removed.
        * The legacy dir *is* the list dir (``_misc`` fallback): only the
          known legacy names (``meta.json``, ``.highwatermark``, ``.lock``
          and the nested ``tasks/`` dir) are removed, so the new
          ``tasks.json`` is left alone.

        Removal is best-effort: ``OSError`` is logged and swallowed.
        """
        old_root = self._legacy_dir(task_list_id)
        if not old_root.exists():
            return

        dedicated_subdir = old_root != self._list_dir(task_list_id)
        if dedicated_subdir:
            try:
                shutil.rmtree(old_root)
            except OSError:
                logger.warning("Failed to remove legacy task dir %s", old_root, exc_info=True)
            return

        # _misc case: shared parent dir — surgical delete only.
        leftovers = [
            self._legacy_meta_path(task_list_id),
            self._legacy_hwm_path(task_list_id),
            self._legacy_lock_path(task_list_id),
        ]
        for leftover in leftovers:
            try:
                leftover.unlink(missing_ok=True)
            except OSError:
                logger.warning("Failed to remove %s", leftover, exc_info=True)

        old_tasks = self._legacy_tasks_dir(task_list_id)
        if not old_tasks.exists():
            return
        try:
            shutil.rmtree(old_tasks)
        except OSError:
            logger.warning("Failed to remove legacy tasks subdir %s", old_tasks, exc_info=True)

    # ----- meta accessors over the doc ----------------------------------

    def _ensure_task_list_sync(
        self,
        task_list_id: str,
        role: TaskListRole,
        creator_agent_id: str | None,
        session_id: str | None,
    ) -> TaskListMeta:
        """Create the list if needed, otherwise record ``session_id`` as seen.

        Runs entirely under the list lock. For an existing list, ``role``
        and ``creator_agent_id`` are ignored; a new ``session_id`` is
        appended to ``last_seen_session_ids`` (keeping only the 10 most
        recent) and the doc is rewritten only in that case.
        """
        with self._list_lock(task_list_id):
            existing = self._read_doc_unsafe(task_list_id)

            if existing is not None:
                meta = existing.meta
                is_new_session = bool(session_id) and session_id not in meta.last_seen_session_ids
                if is_new_session:
                    meta.last_seen_session_ids.append(session_id)
                    # Cap at 10 to keep the audit trail bounded.
                    meta.last_seen_session_ids = meta.last_seen_session_ids[-10:]
                    self._write_doc_unsafe(task_list_id, existing)
                return meta

            fresh_meta = TaskListMeta(
                task_list_id=task_list_id,
                role=role,
                creator_agent_id=creator_agent_id,
                last_seen_session_ids=[session_id] if session_id else [],
            )
            self._write_doc_unsafe(task_list_id, TaskListDocument(meta=fresh_meta))
            return fresh_meta

    def _read_meta_sync(self, task_list_id: str) -> TaskListMeta | None:
        """Return the list's meta, or ``None`` when no document could be read."""
        doc = self._read_doc_sync(task_list_id)
        if doc is None:
            return None
        return doc.meta

    # ----- task IO ------------------------------------------------------

    def _read_task_sync(self, task_list_id: str, task_id: int) -> TaskRecord | None:
        """Return the first task whose id equals ``task_id``, else ``None``.

        ``None`` covers both a missing list and a missing task.
        """
        doc = self._read_doc_sync(task_list_id)
        if doc is None:
            return None
        return next((rec for rec in doc.tasks if rec.id == task_id), None)

    def _list_tasks_sync(self, task_list_id: str) -> list[TaskRecord]:
        """Return every task in the list ordered by id (``[]`` for a missing list)."""
        doc = self._read_doc_sync(task_list_id)
        return [] if doc is None else sorted(doc.tasks, key=lambda rec: rec.id)

    # ----- create -------------------------------------------------------

    def _create_task_sync(
        self,
        task_list_id: str,
        subject: str,
        description: str,
        active_form: str | None,
        owner: str | None,
        metadata: dict[str, Any],
    ) -> TaskRecord:
        """Append one PENDING task to the list and persist it, under the list lock.

        A list that does not exist yet is created on the fly with a role
        inferred from the id prefix. The new id comes from
        ``_next_id_for_doc`` and the high-water-mark is raised to it.
        """
        with self._list_lock(task_list_id):
            doc = self._read_doc_unsafe(task_list_id)
            if doc is None:
                is_colony = task_list_id.startswith("colony:")
                role = TaskListRole.TEMPLATE if is_colony else TaskListRole.SESSION
                doc = TaskListDocument(
                    meta=TaskListMeta(task_list_id=task_list_id, role=role)
                )

            assigned_id = self._next_id_for_doc(doc)
            stamp = time.time()
            created = TaskRecord(
                id=assigned_id,
                subject=subject,
                description=description,
                active_form=active_form,
                owner=owner,
                status=TaskStatus.PENDING,
                metadata=metadata,
                created_at=stamp,
                updated_at=stamp,
            )
            doc.tasks.append(created)
            if doc.highwatermark < assigned_id:
                doc.highwatermark = assigned_id
            self._write_doc_unsafe(task_list_id, doc)
            return created

    def _create_tasks_batch_sync(
        self,
        task_list_id: str,
        specs: list[dict[str, Any]],
    ) -> list[TaskRecord]:
        """Create one PENDING task per spec with contiguous ids and a single write.

        Every spec's ``subject`` is validated before the lock is taken, so a
        malformed entry rejects the batch without touching the store. All
        records share one timestamp. An empty ``specs`` returns ``[]``.

        Raises:
            ValueError: if a spec's ``subject`` is not a non-empty string.
        """
        if not specs:
            return []

        # Validate up-front so we don't half-create on a malformed entry.
        for index, spec in enumerate(specs):
            subject = spec.get("subject")
            acceptable = isinstance(subject, str) and bool(subject.strip())
            if not acceptable:
                raise ValueError(f"specs[{index}].subject must be a non-empty string")

        with self._list_lock(task_list_id):
            doc = self._read_doc_unsafe(task_list_id)
            if doc is None:
                is_colony = task_list_id.startswith("colony:")
                role = TaskListRole.TEMPLATE if is_colony else TaskListRole.SESSION
                doc = TaskListDocument(
                    meta=TaskListMeta(task_list_id=task_list_id, role=role)
                )

            first_id = self._next_id_for_doc(doc)
            stamp = time.time()
            created = [
                TaskRecord(
                    id=first_id + position,
                    subject=spec["subject"],
                    description=spec.get("description", ""),
                    active_form=spec.get("active_form"),
                    owner=spec.get("owner"),
                    status=TaskStatus.PENDING,
                    # Copy so the stored record never aliases the caller's dict.
                    metadata=dict(spec.get("metadata") or {}),
                    created_at=stamp,
                    updated_at=stamp,
                )
                for position, spec in enumerate(specs)
            ]

            doc.tasks.extend(created)
            last_id = created[-1].id
            if doc.highwatermark < last_id:
                doc.highwatermark = last_id
            # Single write — atomic batch is free with the doc model.
            self._write_doc_unsafe(task_list_id, doc)
            return created

    # ----- id assignment ------------------------------------------------

    def _next_id_for_doc(self, doc: TaskListDocument) -> int:
        """Return the id for the next task in ``doc``.

        That is one more than the larger of the stored high-water-mark and
        the highest id among the current tasks, so ids of deleted tasks are
        not handed out again.
        """
        highest_task_id = 0
        if doc.tasks:
            highest_task_id = max(rec.id for rec in doc.tasks)
        floor = doc.highwatermark
        return (highest_task_id if highest_task_id > floor else floor) + 1

    # ----- update -------------------------------------------------------

    def _update_task_sync(
        self,
        task_list_id: str,
        task_id: int,
        subject: str | None,
        description: str | None,
        active_form: str | None,
        owner: str | None | _Unset,
        status: TaskStatus | None,
        add_blocks: list[int] | None,
        add_blocked_by: list[int] | None,
        metadata_patch: dict[str, Any] | None,
    ) -> tuple[TaskRecord | None, list[str]]:
        """Apply an update to one task under the list lock.

        Returns ``(record, changed_field_names)``, or ``(None, [])`` when
        the list or the task does not exist. The document is rewritten only
        if at least one field changed.
        """
        with self._list_lock(task_list_id):
            doc = self._read_doc_unsafe(task_list_id)
            record = None
            if doc is not None:
                record = next((rec for rec in doc.tasks if rec.id == task_id), None)
            if record is None:
                return None, []

            edits = {
                "subject": subject,
                "description": description,
                "active_form": active_form,
                "owner": owner,
                "status": status,
                "add_blocks": add_blocks,
                "add_blocked_by": add_blocked_by,
                "metadata_patch": metadata_patch,
            }
            updated, fields_changed = self._update_task_in_doc(doc, record, **edits)
            if fields_changed:
                self._write_doc_unsafe(task_list_id, doc)
            return updated, fields_changed

    def _update_task_in_doc(
        self,
        doc: TaskListDocument,
        current: TaskRecord,
        *,
        subject: str | None = None,
        description: str | None = None,
        active_form: str | None = None,
        owner: str | None | _Unset = _UNSET_SENTINEL,
        status: TaskStatus | None = None,
        add_blocks: list[int] | None = None,
        add_blocked_by: list[int] | None = None,
        metadata_patch: dict[str, Any] | None = None,
    ) -> tuple[TaskRecord, list[str]]:
        """Apply edits to ``current`` in place and report which fields changed.

        Only values that differ from the record are applied. ``None`` means
        "leave alone" for subject/description/active_form/status; ``owner``
        is skipped only when it is the unset sentinel. New block links are
        mirrored onto the referenced task in ``doc`` when it exists (and its
        ``updated_at`` is bumped); self-references and existing links are
        ignored. In ``metadata_patch`` a ``None`` value deletes the key.

        ``current.updated_at`` is refreshed only if something changed.
        Nothing is persisted here.
        """
        changed: list[str] = []

        # Plain scalar fields where None means "not provided".
        simple_edits = (
            ("subject", subject),
            ("description", description),
            ("active_form", active_form),
        )
        for field, value in simple_edits:
            if value is not None and value != getattr(current, field):
                setattr(current, field, value)
                changed.append(field)

        owner_given = not isinstance(owner, _Unset)
        if owner_given and owner != current.owner:
            current.owner = owner
            changed.append("owner")

        if status is not None and status != current.status:
            current.status = status
            changed.append("status")

        def _link(ids: list[int] | None, forward: str, backward: str) -> None:
            # Add each id to current.<forward> and mirror current.id onto the
            # peer's <backward> list.
            for other_id in ids or ():
                own_links = getattr(current, forward)
                if other_id in own_links or other_id == current.id:
                    continue
                own_links.append(other_id)
                if forward not in changed:
                    changed.append(forward)
                peer = next((rec for rec in doc.tasks if rec.id == other_id), None)
                if peer is not None and current.id not in getattr(peer, backward):
                    getattr(peer, backward).append(current.id)
                    peer.updated_at = time.time()

        _link(add_blocks, "blocks", "blocked_by")
        _link(add_blocked_by, "blocked_by", "blocks")

        if metadata_patch is not None:
            merged = dict(current.metadata)
            for key, value in metadata_patch.items():
                if value is None:
                    merged.pop(key, None)
                else:
                    merged[key] = value
            if merged != current.metadata:
                current.metadata = merged
                changed.append("metadata")

        if changed:
            current.updated_at = time.time()
        return current, changed

    # ----- delete -------------------------------------------------------

    def _delete_task_sync(self, task_list_id: str, task_id: int) -> tuple[bool, list[int]]:
        """Remove a task and every reference to it, under the list lock.

        Returns ``(True, cascaded_ids)`` where ``cascaded_ids`` lists the
        other tasks that had the id stripped from ``blocks``/``blocked_by``,
        or ``(False, [])`` when the list or task does not exist.
        """
        with self._list_lock(task_list_id):
            doc = self._read_doc_unsafe(task_list_id)
            if doc is None:
                return False, []

            position = next(
                (index for index, rec in enumerate(doc.tasks) if rec.id == task_id),
                None,
            )
            if position is None:
                return False, []

            # Raise the high-water-mark first so the deleted id can never be
            # reassigned; the whole new state lands in one write below.
            if doc.highwatermark < task_id:
                doc.highwatermark = task_id
            del doc.tasks[position]

            # Cascade: strip references from all other tasks.
            cascaded: list[int] = []
            stamp = time.time()
            for other in doc.tasks:
                in_blocks = task_id in other.blocks
                in_blocked_by = task_id in other.blocked_by
                if in_blocks:
                    other.blocks = [ref for ref in other.blocks if ref != task_id]
                if in_blocked_by:
                    other.blocked_by = [ref for ref in other.blocked_by if ref != task_id]
                if in_blocks or in_blocked_by:
                    other.updated_at = stamp
                    cascaded.append(other.id)

            self._write_doc_unsafe(task_list_id, doc)
            return True, cascaded

    # ----- reset --------------------------------------------------------

    def _reset_sync(self, task_list_id: str) -> None:
        """Drop every task from the list while keeping ids monotonic.

        The high-water-mark is first raised to the highest existing task id,
        then the task list is emptied and the doc rewritten. No-op when the
        list does not exist.
        """
        with self._list_lock(task_list_id):
            doc = self._read_doc_unsafe(task_list_id)
            if doc is None:
                return
            highest_id = max((rec.id for rec in doc.tasks), default=0)
            doc.highwatermark = max(doc.highwatermark, highest_id)
            doc.tasks = []
            self._write_doc_unsafe(task_list_id, doc)

    # ----- claim --------------------------------------------------------

    def _claim_sync(self, task_list_id: str, task_id: int, claimant: str) -> ClaimResult:
        """Make ``claimant`` the owner of a task, under the list lock.

        Checks run in this order, and the first that applies decides the
        result:

        1. list or task missing            -> ``ClaimNotFound``
        2. task already completed          -> ``ClaimAlreadyCompleted``
        3. owned by a different claimant   -> ``ClaimAlreadyOwned``
        4. some ``blocked_by`` task exists and is not completed
                                           -> ``ClaimBlocked`` (with ids)
        5. otherwise                       -> ``ClaimOk`` with the record

        Blocker ids that no longer resolve to a task are ignored. The
        document is rewritten only when the claim actually changed the
        record; re-claiming a task the claimant already owns returns
        ``ClaimOk`` without a disk write.
        """
        with self._list_lock(task_list_id):
            doc = self._read_doc_unsafe(task_list_id)
            if doc is None:
                return ClaimNotFound(kind="not_found")
            current = next((r for r in doc.tasks if r.id == task_id), None)
            if current is None:
                return ClaimNotFound(kind="not_found")
            if current.status == TaskStatus.COMPLETED:
                return ClaimAlreadyCompleted(kind="already_completed")
            if current.owner is not None and current.owner != claimant:
                return ClaimAlreadyOwned(kind="already_owned", by=current.owner)
            unresolved_blockers: list[int] = []
            for b in current.blocked_by:
                blocker = next((r for r in doc.tasks if r.id == b), None)
                if blocker is not None and blocker.status != TaskStatus.COMPLETED:
                    unresolved_blockers.append(b)
            if unresolved_blockers:
                return ClaimBlocked(kind="blocked", by=unresolved_blockers)
            new, changed = self._update_task_in_doc(doc, current, owner=claimant)
            # Same rule as the update path: skip the rewrite when nothing changed.
            if changed:
                self._write_doc_unsafe(task_list_id, doc)
            return ClaimOk(kind="ok", record=new)


# ---------------------------------------------------------------------------
# Process-wide singleton (small, stateless wrapper)
# ---------------------------------------------------------------------------


# Lazily created by ``get_task_store``; ``None`` until the first call.
_default_store: TaskStore | None = None


def get_task_store() -> TaskStore:
    """Return the process-wide default ``TaskStore``, creating it on first use.

    The store is built without an explicit ``hive_root``, so the data
    directory (``HIVE_HOME`` or ``~/.hive``) is resolved on each path lookup
    rather than captured here.

    Tests should construct a TaskStore directly with hive_root=tmp_path
    rather than relying on the singleton.
    """
    global _default_store
    store = _default_store
    if store is None:
        store = _default_store = TaskStore()
    return store


# Convenience for tests / utilities.
def fingerprint_for_test(task_list_id: str, hive_root: Path) -> Iterable[Path]:
    """Return a sorted list of every path belonging to a task list.

    Used by tests to assert byte-equivalence pre/post shutdown. Covers the
    doc and lock file when present, plus legacy leftovers so the result is
    still meaningful while a list is mid-migration:

    * dedicated legacy dir: everything below it (``rglob("*")``, so
      directories are included as well as files);
    * ``_misc`` fallback, where the legacy dir is the list dir itself: only
      the known legacy names and the contents of the nested ``tasks/`` dir.

    An empty list is returned when the list directory does not exist.
    """
    base = task_list_path(task_list_id, hive_root=hive_root)
    if not base.exists():
        return []

    current_files = (base / DOC_FILENAME, base / LOCK_FILENAME)
    found: list[Path] = [path for path in current_files if path.exists()]

    legacy = _legacy_root(task_list_id, hive_root=hive_root)
    if legacy.exists():
        if legacy != base:
            found.extend(legacy.rglob("*"))
        else:
            # _misc fallback: include only legacy filenames
            named = (legacy / name for name in ("meta.json", ".highwatermark", ".lock"))
            found.extend(path for path in named if path.exists())
            nested_tasks = legacy / "tasks"
            if nested_tasks.exists():
                found.extend(nested_tasks.rglob("*"))

    return sorted(found)