feat: llm logging
@@ -1,11 +1,22 @@
"""Write every LLM turn to ~/.hive/llm_logs/<ts>.jsonl for replay/debugging.

Each line is a JSON object with the full LLM turn: the request payload
(system prompt + messages), assistant text, tool calls, tool results, and
token counts. The file is opened lazily on first call and flushed after every
write. Errors are silently swallowed — this must never break the agent.

Two record kinds, distinguished by ``_kind``:

* ``session_header`` — emitted on the first turn of an ``execution_id`` and
  any time its ``system_prompt`` or ``tools`` change. Carries those large
  fields once instead of per-turn.
* ``turn`` — one per LLM call. Carries per-turn outputs plus a
  content-addressed message delta: ``message_hashes`` is the full ordered
  message sequence for this turn, ``new_messages`` is hash → body for
  messages we haven't emitted before for this ``execution_id``. The reader
  reassembles full ``messages`` by accumulating ``new_messages`` across
  prior turn records. Content-addressed (not positional) because the agent
  prunes messages mid-session — a tail-delta would be wrong.
"""

import hashlib
import json
import logging
import os
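For illustration, one session header followed by a later turn record might look roughly like this (a sketch; values are invented and heavily truncated, and real records also carry node_id, stream_id, iteration, and token_counts). The turn re-lists both message hashes but ships only the one unseen body:

{"_kind": "session_header", "execution_id": "e1", "header_hash": "ab12cd34ef56ab78", "system_prompt": "You are ...", "tools": []}
{"_kind": "turn", "execution_id": "e1", "header_hash": "ab12cd34ef56ab78", "message_hashes": ["h1", "h2"], "new_messages": {"h2": {"role": "assistant", "content": "hello"}}, "assistant_text": "hello"}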
@@ -28,6 +39,12 @@ def _llm_debug_dir() -> Path:
_log_file: IO[str] | None = None
_log_ready = False  # lazy init guard

# Per-execution_id delta state. Reset implicitly on process restart — a fresh
# log file has no prior context, so re-emitting the header on first turn is
# correct.
_session_header_hash: dict[str, str] = {}
_session_seen_msgs: dict[str, set[str]] = {}


def _open_log() -> IO[str] | None:
    """Open the JSONL log file for this process."""
@@ -61,6 +78,17 @@ def _serialize_tools(tools: Any) -> list[dict[str, Any]]:
    return out


def _content_hash(payload: Any) -> str:
    raw = json.dumps(payload, default=str, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:16]


def _write_line(record: dict[str, Any]) -> None:
    assert _log_file is not None
    _log_file.write(json.dumps(record, default=str) + "\n")
    _log_file.flush()
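A note on _content_hash: sort_keys=True makes the digest independent of dict key order, so the same message always hashes the same way, and truncating to 16 hex chars (64 bits) keeps records compact while making per-session collisions vanishingly unlikely. A quick illustrative check:

assert _content_hash({"role": "user", "content": "hi"}) == _content_hash({"content": "hi", "role": "user"})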


def log_llm_turn(
    *,
    node_id: str,
@@ -75,7 +103,7 @@ def log_llm_turn(
    token_counts: dict[str, Any],
    tools: list[Any] | None = None,
) -> None:
    """Write JSONL records capturing one LLM turn (header + turn delta).

    Never raises.
    """
@@ -89,23 +117,57 @@ def log_llm_turn(
            _log_ready = True
        if _log_file is None:
            return

        # UTC + offset matches tool_call start_timestamp (agent_loop.py)
        # so the viewer can render every event in one consistent local zone.
        timestamp = datetime.now(UTC).isoformat()
        serialized_tools = _serialize_tools(tools)

        # Re-emit the header on first turn or whenever system/tools change.
        # The Queen reflects different prompts across turns, so we can't
        # assume strict immutability per execution_id.
        header_hash = _content_hash({"system_prompt": system_prompt, "tools": serialized_tools})
        if _session_header_hash.get(execution_id) != header_hash:
            _write_line(
                {
                    "_kind": "session_header",
                    "timestamp": timestamp,
                    "execution_id": execution_id,
                    "node_id": node_id,
                    "stream_id": stream_id,
                    "iteration": iteration,
                    "header_hash": header_hash,
                    "system_prompt": system_prompt,
                    "tools": serialized_tools,
                }
            )
            _session_header_hash[execution_id] = header_hash

        seen = _session_seen_msgs.setdefault(execution_id, set())
        message_hashes: list[str] = []
        new_messages: dict[str, dict[str, Any]] = {}
        for msg in messages or []:
            h = _content_hash(msg)
            message_hashes.append(h)
            if h not in seen:
                seen.add(h)
                new_messages[h] = msg

        _write_line(
            {
                "_kind": "turn",
                "timestamp": timestamp,
                "execution_id": execution_id,
                "node_id": node_id,
                "stream_id": stream_id,
                "iteration": iteration,
                "header_hash": header_hash,
                "message_hashes": message_hashes,
                "new_messages": new_messages,
                "assistant_text": assistant_text,
                "tool_calls": tool_calls,
                "tool_results": tool_results,
                "token_counts": token_counts,
            }
        )
    except Exception:
        pass  # never break the agent
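To make the delta concrete: a turn that appends one message re-lists every hash but ships only the new body. A minimal standalone sketch of that bookkeeping (hypothetical helper names, mirroring the logic above):

import hashlib
import json

_seen: set[str] = set()

def _h(msg: dict) -> str:
    raw = json.dumps(msg, default=str, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:16]

def _delta(messages: list[dict]) -> tuple[list[str], dict[str, dict]]:
    hashes = [_h(m) for m in messages]
    new = {x: m for x, m in zip(hashes, messages) if x not in _seen}
    _seen.update(new)
    return hashes, new

a = {"role": "user", "content": "hi"}
b = {"role": "assistant", "content": "hello"}
_delta([a, b])     # turn 1: all hashes, both bodies in new_messages
_delta([a, b])     # turn 2 unchanged: same hashes, empty new_messages
_delta([a, b, {"role": "user", "content": "bye"}])  # turn 3: ships only the new body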
File diff suppressed because it is too large
@@ -64,6 +64,54 @@ def _format_timestamp(raw: str) -> str:
    return raw


def _reassemble_records(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert new-format (header + turn-delta) records into legacy-shape full turns.

    Records lacking ``_kind`` are passed through unchanged. Inputs must be in
    file order so headers precede the turns that reference them.
    """
    headers: dict[str, dict[str, Any]] = {}  # execution_id -> latest session_header
    pools: dict[str, dict[str, dict[str, Any]]] = {}  # execution_id -> hash -> message body

    out: list[dict[str, Any]] = []
    for rec in records:
        kind = rec.get("_kind")
        if kind == "session_header":
            eid = str(rec.get("execution_id") or "")
            headers[eid] = rec
            pools.setdefault(eid, {})
            continue
        if kind == "turn":
            eid = str(rec.get("execution_id") or "")
            pool = pools.setdefault(eid, {})
            new_msgs = rec.get("new_messages") or {}
            if isinstance(new_msgs, dict):
                pool.update(new_msgs)
            hashes = rec.get("message_hashes") or []
            messages = [pool[h] for h in hashes if h in pool]
            header = headers.get(eid, {})
            out.append(
                {
                    "timestamp": rec.get("timestamp", ""),
                    "execution_id": eid,
                    "node_id": rec.get("node_id", ""),
                    "stream_id": rec.get("stream_id", ""),
                    "iteration": rec.get("iteration", 0),
                    "system_prompt": header.get("system_prompt", ""),
                    "tools": header.get("tools", []),
                    "messages": messages,
                    "assistant_text": rec.get("assistant_text", ""),
                    "tool_calls": rec.get("tool_calls", []),
                    "tool_results": rec.get("tool_results", []),
                    "token_counts": rec.get("token_counts", {}),
                    "_log_file": rec.get("_log_file", ""),
                }
            )
            continue
        out.append(rec)
    return out
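For example, given one header and two turn deltas from a single file (hash values shortened for readability; records assumed to be in file order), the reader rebuilds full turns:

header = {"_kind": "session_header", "execution_id": "e1", "system_prompt": "sp", "tools": []}
turn1 = {"_kind": "turn", "execution_id": "e1", "message_hashes": ["h1"],
         "new_messages": {"h1": {"role": "user", "content": "hi"}}}
turn2 = {"_kind": "turn", "execution_id": "e1", "message_hashes": ["h1", "h2"],
         "new_messages": {"h2": {"role": "assistant", "content": "hello"}}}

full = _reassemble_records([header, turn1, turn2])
# full[0]["messages"] -> [the user "hi" message]
# full[1]["messages"] -> both messages; each turn carries system_prompt "sp" from the header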


def _is_test_session(execution_id: str, records: list[dict[str, Any]]) -> bool:
    if execution_id.startswith("<MagicMock"):
        return True
@@ -100,6 +148,9 @@ def _discover_session_summaries(logs_dir: Path, limit_files: int, include_tests:
                    payload = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # session_header is metadata, not a turn — don't count it.
                if payload.get("_kind") == "session_header":
                    continue
                eid = str(payload.get("execution_id") or "").strip()
                if not eid:
                    continue
@@ -157,6 +208,10 @@ def _load_session_data(logs_dir: Path, session_id: str, limit_files: int) -> lis

    records: list[dict[str, Any]] = []
    for path in files:
        # Reassemble per-file: each file is self-contained because the writer
        # re-emits the session_header on every process start, so we never need
        # cross-file state to fill in messages/system_prompt/tools.
        file_records: list[dict[str, Any]] = []
        try:
            with path.open(encoding="utf-8") as handle:
                for line_number, raw_line in enumerate(handle, start=1):
@@ -166,17 +221,23 @@ def _load_session_data(logs_dir: Path, session_id: str, limit_files: int) -> lis
                    try:
                        payload = json.loads(line)
                    except json.JSONDecodeError:
                        records.append(
                            {
                                "timestamp": "",
                                "execution_id": "",
                                "_parse_error": f"{path.name}:{line_number}",
                                "_raw_line": line,
                                "_log_file": str(path),
                            }
                        )
                        continue
                    if str(payload.get("execution_id") or "").strip() != session_id:
                        continue
                    payload["_log_file"] = str(path)
                    file_records.append(payload)
        except OSError:
            continue
        records.extend(_reassemble_records(file_records))

    if not records:
        return None
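Reading a log file end to end is then just JSONL parsing plus reassembly. A minimal standalone sketch (the file name is hypothetical; the real loader adds session filtering and parse-error records):

import json
from pathlib import Path

def read_log(path: Path) -> list[dict]:
    records = []
    with path.open(encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return _reassemble_records(records)  # reuses the reader helper defined above

turns = read_log(Path.home() / ".hive" / "llm_logs" / "example.jsonl")  # hypothetical file name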