feat: robust run id

2026-04-02 12:35:16 -07:00
parent 00c55d5fb2
commit 60d094464a
8 changed files with 105 additions and 63 deletions
@@ -11,6 +11,11 @@ from typing import Any, Literal, Protocol, runtime_checkable
 LEGACY_RUN_ID = "__legacy_run__"


+def is_legacy_run_id(run_id: str | None) -> bool:
+    """True when run_id represents pre-migration (no run boundary) data."""
+    return run_id is None or run_id == LEGACY_RUN_ID
+
+
@dataclass
 class Message:
    """A single message in a conversation.
@@ -1131,17 +1136,28 @@ class NodeConversation:
        await self._write_next_seq()

    async def _persist_meta(self) -> None:
-        """Lazily write conversation metadata to the store (called once)."""
+        """Lazily write conversation metadata to the store and mark it persisted.
+
+        When ``self._run_id`` is set, metadata is keyed under
+        ``meta["runs"][run_id]`` so multiple runs can coexist in the same
+        session.  Sessions without a run_id write flat, un-keyed metadata for backward compatibility.
+        """
        if self._store is None:
            return
-        await self._store.write_meta(
-            {
+        run_meta = {
            "system_prompt": self._system_prompt,
            "max_context_tokens": self._max_context_tokens,
            "compaction_threshold": self._compaction_threshold,
            "output_keys": self._output_keys,
        }
-        )
+        if self._run_id:
+            existing = await self._store.read_meta() or {}
+            runs = dict(existing.get("runs", {}))
+            runs[self._run_id] = run_meta
+            existing["runs"] = runs
+            await self._store.write_meta(existing)
+        else:
+            await self._store.write_meta(run_meta)
        self._meta_persisted = True

    async def _write_next_seq(self) -> None:
@@ -1175,6 +1191,12 @@ class NodeConversation:
        if meta is None:
            return None

+        # Extract run-scoped metadata when available
+        if run_id and isinstance(meta.get("runs"), dict):
+            run_meta = meta["runs"].get(run_id)
+            if run_meta is not None:
+                meta = run_meta
+
        conv = cls(
            system_prompt=meta.get("system_prompt", ""),
            max_context_tokens=meta.get("max_context_tokens", 32000),
@@ -1187,8 +1209,8 @@ class NodeConversation:

        parts = await store.read_parts()
        if run_id is not None:
-            if run_id == LEGACY_RUN_ID:
-                parts = [p for p in parts if p.get("run_id") in (None, LEGACY_RUN_ID)]
+            if is_legacy_run_id(run_id):
+                parts = [p for p in parts if is_legacy_run_id(p.get("run_id"))]
            else:
                parts = [p for p in parts if p.get("run_id") == run_id]
        if phase_id:
@@ -61,17 +61,17 @@ async def restore(
    conversation = await NodeConversation.restore(
        conversation_store,
        phase_id=phase_filter,
-        run_id=ctx.run_id or None,
+        run_id=ctx.effective_run_id,
    )
    if conversation is None:
        return None

-    accumulator = await OutputAccumulator.restore(conversation_store, run_id=ctx.run_id or None)
+    accumulator = await OutputAccumulator.restore(conversation_store, run_id=ctx.effective_run_id)
    accumulator.spillover_dir = config.spillover_dir
    accumulator.max_value_chars = config.max_output_value_chars

    cursor = await conversation_store.read_cursor()
-    run_cursor = get_run_cursor(cursor, ctx.run_id or None)
+    run_cursor = get_run_cursor(cursor, ctx.effective_run_id)
    start_iteration = run_cursor.get("iteration", 0) + 1 if run_cursor else 0

    # Restore stall/doom-loop detection state
@@ -128,7 +128,7 @@ async def write_cursor(
            run_cursor["recent_tool_fingerprints"] = [
                [list(pair) for pair in fps] for fps in recent_tool_fingerprints
            ]
-        await conversation_store.write_cursor(update_run_cursor(cursor, ctx.run_id or None, run_cursor))
+        await conversation_store.write_cursor(update_run_cursor(cursor, ctx.effective_run_id, run_cursor))


 async def drain_injection_queue(
@@ -367,7 +367,7 @@ class EventLoopNode(NodeProtocol):
                store=self._conversation_store,
                spillover_dir=self._config.spillover_dir,
                max_value_chars=self._config.max_output_value_chars,
-                run_id=ctx.run_id or None,
+                run_id=ctx.effective_run_id,
            )
            start_iteration = 0
            _restored_recent_responses: list[str] = []
@@ -418,6 +418,12 @@ class EventLoopNode(NodeProtocol):
                if conversation.system_prompt != _current_prompt:
                    conversation.update_system_prompt(_current_prompt)
                    logger.info("Refreshed system prompt for restored conversation")
+
+                # Refresh other meta fields that may differ across runs
+                conversation._max_context_tokens = self._config.max_context_tokens
+                if ctx.node_spec.output_keys:
+                    conversation._output_keys = ctx.node_spec.output_keys
+                conversation._meta_persisted = False  # Force re-persist with updated values
            else:
                _restored_recent_responses = []
                _restored_tool_fingerprints = []
@@ -481,7 +487,7 @@ class EventLoopNode(NodeProtocol):
                    max_context_tokens=self._config.max_context_tokens,
                    output_keys=ctx.node_spec.output_keys or None,
                    store=self._conversation_store,
-                    run_id=ctx.run_id or None,
+                    run_id=ctx.effective_run_id,
                )
                # Stamp phase for first node in continuous mode
                if _is_continuous:
@@ -490,7 +496,7 @@ class EventLoopNode(NodeProtocol):
                    store=self._conversation_store,
                    spillover_dir=self._config.spillover_dir,
                    max_value_chars=self._config.max_output_value_chars,
-                    run_id=ctx.run_id or None,
+                    run_id=ctx.effective_run_id,
                )
                start_iteration = 0

@@ -481,6 +481,16 @@ class NodeContext:
    execution_id: str = ""
    run_id: str = ""

+    @property
+    def effective_run_id(self) -> str | None:
+        """Normalized run_id: returns run_id if truthy, otherwise None.
+
+        The field defaults to ``""``; callers should use this property
+        instead of ``self.run_id or None`` to avoid silently falling
+        back to session-scoped storage.
+        """
+        return self.run_id or None
+
    # Stream identity — the ExecutionStream this node runs within.
    # Falls back to node_id when not set (legacy / standalone executor).
    stream_id: str = ""
@@ -250,7 +250,10 @@ def register_commands(subparsers: argparse._SubParsersAction) -> None:
 def _load_resume_state(
    agent_path: str, session_id: str, checkpoint_id: str | None = None
 ) -> dict | None:
-    """Load session or checkpoint state for headless resume.
+    """Load checkpoint state for headless resume.
+
+    All resumes require a checkpoint. If ``checkpoint_id`` is not provided,
+    the most recently modified checkpoint file (by mtime) is auto-discovered.

    Args:
        agent_path: Path to the agent folder (e.g., exports/my_agent)
@@ -258,7 +261,7 @@ def _load_resume_state(
        checkpoint_id: Optional checkpoint ID within the session

    Returns:
-        session_state dict for executor, or None if not found
+        session_state dict for executor, or None if the session directory or a readable checkpoint is missing
    """
    agent_name = Path(agent_path).name
    agent_work_dir = Path.home() / ".hive" / "agents" / agent_name
@@ -267,8 +270,20 @@ def _load_resume_state(
    if not session_dir.exists():
        return None

-    if checkpoint_id:
-        # Checkpoint-based resume: load checkpoint and extract state
+    # Auto-discover latest checkpoint when not specified
+    if not checkpoint_id:
+        cp_dir = session_dir / "checkpoints"
+        if cp_dir.exists():
+            checkpoints = sorted(
+                cp_dir.glob("*.json"),
+                key=lambda p: p.stat().st_mtime,
+                reverse=True,
+            )
+            if checkpoints:
+                checkpoint_id = checkpoints[0].stem
+        if not checkpoint_id:
+            return None
+
    cp_path = session_dir / "checkpoints" / f"{checkpoint_id}.json"
    if not cp_path.exists():
        return None
@@ -276,30 +291,15 @@ def _load_resume_state(
        cp_data = json.loads(cp_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        return None
+
    return {
        "resume_session_id": session_id,
+        "resume_from_checkpoint": checkpoint_id,
+        "run_id": cp_data.get("run_id") or None,
        "data_buffer": cp_data.get("data_buffer", cp_data.get("shared_memory", {})),
        "paused_at": cp_data.get("next_node") or cp_data.get("current_node"),
        "execution_path": cp_data.get("execution_path", []),
-            "node_visit_counts": {},
-        }
-    else:
-        # Session state resume: load state.json
-        state_path = session_dir / "state.json"
-        if not state_path.exists():
-            return None
-        try:
-            state_data = json.loads(state_path.read_text(encoding="utf-8"))
-        except (json.JSONDecodeError, OSError):
-            return None
-        progress = state_data.get("progress", {})
-        paused_at = progress.get("paused_at") or progress.get("resume_from")
-        return {
-            "resume_session_id": session_id,
-            "data_buffer": state_data.get("data_buffer", state_data.get("memory", {})),
-            "paused_at": paused_at,
-            "execution_path": progress.get("path", []),
-            "node_visit_counts": progress.get("node_visit_counts", {}),
+        "node_visit_counts": cp_data.get("node_visit_counts", {}),
    }


@@ -169,11 +169,10 @@ class SessionState(BaseModel):
    def is_resumable(self) -> bool:
        """Can this session be resumed?

-        Every non-completed session is resumable. If resume_from/paused_at
-        aren't set, the executor falls back to the graph entry point —
-        so we don't gate on those. Even catastrophic failures are resumable.
+        Only sessions with a valid checkpoint can be resumed.
+        State-based resume (without a checkpoint) is no longer supported.
        """
-        return self.status != SessionStatus.COMPLETED
+        return self.is_resumable_from_checkpoint

    @computed_field
    @property
@@ -294,7 +293,11 @@ class SessionState(BaseModel):
        )

    def to_session_state_dict(self) -> dict[str, Any]:
-        """Convert to session_state format for GraphExecutor.execute()."""
+        """Convert to session_state format for GraphExecutor.execute().
+
+        NOTE: state-based resume via paused_at/resume_from is deprecated.
+        Use checkpoint-based resume (``resume_from_checkpoint`` key) instead.
+        """
        # Derive resume target: explicit > last node in path > entry point
        resume_from = (
            self.progress.resume_from
@@ -29,7 +29,7 @@ import shutil
 from pathlib import Path
 from typing import Any

-from framework.graph.conversation import LEGACY_RUN_ID
+from framework.graph.conversation import LEGACY_RUN_ID, is_legacy_run_id


 class FileConversationStore:
@@ -109,8 +109,8 @@ class FileConversationStore:
                        continue
                    data = self._read_json(f) or {}
                    part_run_id = data.get("run_id")
-                    if run_id == LEGACY_RUN_ID:
-                        if part_run_id in (None, LEGACY_RUN_ID):
+                    if is_legacy_run_id(run_id):
+                        if is_legacy_run_id(part_run_id):
                            f.unlink()
                    elif part_run_id == run_id:
                        f.unlink()
@@ -12,6 +12,7 @@ from framework.graph.conversation import (
    Message,
    NodeConversation,
    extract_tool_call_history,
+    is_legacy_run_id,
 )
 from framework.storage.conversation_store import FileConversationStore

@@ -55,8 +56,8 @@ class MockConversationStore:
            if run_id is None:
                continue
            part_run_id = value.get("run_id")
-            if run_id == LEGACY_RUN_ID:
-                if part_run_id not in (None, LEGACY_RUN_ID):
+            if is_legacy_run_id(run_id):
+                if not is_legacy_run_id(part_run_id):
                    kept[key] = value
            elif part_run_id != run_id:
                kept[key] = value