feat: llm logging

2026-05-04 17:57:20 -07:00
parent 07600c5ab5
commit 07c97e2e9b
3 changed files with 155 additions and 1099 deletions
@@ -1,11 +1,22 @@
 """Write every LLM turn to ~/.hive/llm_logs/<ts>.jsonl for replay/debugging.
-Each line is a JSON object with the full LLM turn: the request payload
+Two record kinds, distinguished by ``_kind``:
-(system prompt + messages), assistant text, tool calls, tool results, and
+
-token counts. The file is opened lazily on first call and flushed after every
+* ``session_header`` — emitted on the first turn of an ``execution_id`` and
-write. Errors are silently swallowed — this must never break the agent.
+  any time its ``system_prompt`` or ``tools`` change. Carries those large
  fields once instead of per-turn.
 * ``turn`` — one per LLM call. Carries per-turn outputs plus a
  content-addressed message delta: ``message_hashes`` is the full ordered
  message sequence for this turn, ``new_messages`` is hash → body for
  messages we haven't emitted before for this ``execution_id``. The reader
  reassembles full ``messages`` by accumulating ``new_messages`` across
  prior turn records. Content-addressed (not positional) because the agent
  prunes messages mid-session — a tail-delta would be wrong.
 Errors are silently swallowed — this must never break the agent.
 """
 import hashlib
 import json
 import logging
 import os
@@ -28,6 +39,12 @@ def _llm_debug_dir() -> Path:
 _log_file: IO[str] | None = None
 _log_ready = False  # lazy init guard
 # Per-execution_id delta state. Reset implicitly on process restart — a fresh
 # log file has no prior context, so re-emitting the header on first turn is
 # correct.
 _session_header_hash: dict[str, str] = {}
 _session_seen_msgs: dict[str, set[str]] = {}
 def _open_log() -> IO[str] | None:
    """Open the JSONL log file for this process."""
@@ -61,6 +78,17 @@ def _serialize_tools(tools: Any) -> list[dict[str, Any]]:
    return out
 def _content_hash(payload: Any) -> str:
    raw = json.dumps(payload, default=str, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:16]
 def _write_line(record: dict[str, Any]) -> None:
    assert _log_file is not None
    _log_file.write(json.dumps(record, default=str) + "\n")
    _log_file.flush()
 def log_llm_turn(
    *,
    node_id: str,
@@ -75,7 +103,7 @@ def log_llm_turn(
    token_counts: dict[str, Any],
    tools: list[Any] | None = None,
 ) -> None:
-    """Write one JSONL line capturing a complete LLM turn.
+    """Write JSONL records capturing one LLM turn (header + turn delta).
    Never raises.
    """
@@ -89,23 +117,57 @@ def log_llm_turn(
            _log_ready = True
        if _log_file is None:
            return
-        record = {
+
        # UTC + offset matches tool_call start_timestamp (agent_loop.py)
        # so the viewer can render every event in one consistent local zone.
-            "timestamp": datetime.now(UTC).isoformat(),
+        timestamp = datetime.now(UTC).isoformat()
        serialized_tools = _serialize_tools(tools)
        # Re-emit the header on first turn or whenever system/tools change.
        # The Queen reflects different prompts across turns, so we can't
        # assume strict immutability per execution_id.
        header_hash = _content_hash({"system_prompt": system_prompt, "tools": serialized_tools})
        if _session_header_hash.get(execution_id) != header_hash:
            _write_line(
                {
                    "_kind": "session_header",
                    "timestamp": timestamp,
                    "execution_id": execution_id,
                    "node_id": node_id,
                    "stream_id": stream_id,
-            "execution_id": execution_id,
+                    "header_hash": header_hash,
            "iteration": iteration,
                    "system_prompt": system_prompt,
-            "tools": _serialize_tools(tools),
+                    "tools": serialized_tools,
-            "messages": messages,
+                }
            )
            _session_header_hash[execution_id] = header_hash
        seen = _session_seen_msgs.setdefault(execution_id, set())
        message_hashes: list[str] = []
        new_messages: dict[str, dict[str, Any]] = {}
        for msg in messages or []:
            h = _content_hash(msg)
            message_hashes.append(h)
            if h not in seen:
                seen.add(h)
                new_messages[h] = msg
        _write_line(
            {
                "_kind": "turn",
                "timestamp": timestamp,
                "execution_id": execution_id,
                "node_id": node_id,
                "stream_id": stream_id,
                "iteration": iteration,
                "header_hash": header_hash,
                "message_hashes": message_hashes,
                "new_messages": new_messages,
                "assistant_text": assistant_text,
                "tool_calls": tool_calls,
                "tool_results": tool_results,
                "token_counts": token_counts,
            }
-        _log_file.write(json.dumps(record, default=str) + "\n")
+        )
        _log_file.flush()
    except Exception:
        pass  # never break the agent
@@ -64,6 +64,54 @@ def _format_timestamp(raw: str) -> str:
        return raw
 def _reassemble_records(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert new-format (header + turn-delta) records into legacy-shape full turns.
    Records lacking ``_kind`` are passed through unchanged. Inputs must be in
    file order so headers precede the turns that reference them.
    """
    headers: dict[str, dict[str, Any]] = {}  # execution_id -> latest session_header
    pools: dict[str, dict[str, dict[str, Any]]] = {}  # execution_id -> hash -> message body
    out: list[dict[str, Any]] = []
    for rec in records:
        kind = rec.get("_kind")
        if kind == "session_header":
            eid = str(rec.get("execution_id") or "")
            headers[eid] = rec
            pools.setdefault(eid, {})
            continue
        if kind == "turn":
            eid = str(rec.get("execution_id") or "")
            pool = pools.setdefault(eid, {})
            new_msgs = rec.get("new_messages") or {}
            if isinstance(new_msgs, dict):
                pool.update(new_msgs)
            hashes = rec.get("message_hashes") or []
            messages = [pool[h] for h in hashes if h in pool]
            header = headers.get(eid, {})
            out.append(
                {
                    "timestamp": rec.get("timestamp", ""),
                    "execution_id": eid,
                    "node_id": rec.get("node_id", ""),
                    "stream_id": rec.get("stream_id", ""),
                    "iteration": rec.get("iteration", 0),
                    "system_prompt": header.get("system_prompt", ""),
                    "tools": header.get("tools", []),
                    "messages": messages,
                    "assistant_text": rec.get("assistant_text", ""),
                    "tool_calls": rec.get("tool_calls", []),
                    "tool_results": rec.get("tool_results", []),
                    "token_counts": rec.get("token_counts", {}),
                    "_log_file": rec.get("_log_file", ""),
                }
            )
            continue
        out.append(rec)
    return out
 def _is_test_session(execution_id: str, records: list[dict[str, Any]]) -> bool:
    if execution_id.startswith("<MagicMock"):
        return True
@@ -100,6 +148,9 @@ def _discover_session_summaries(logs_dir: Path, limit_files: int, include_tests:
                        payload = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    # session_header is metadata, not a turn — don't count it.
                    if payload.get("_kind") == "session_header":
                        continue
                    eid = str(payload.get("execution_id") or "").strip()
                    if not eid:
                        continue
@@ -157,6 +208,10 @@ def _load_session_data(logs_dir: Path, session_id: str, limit_files: int) -> lis
    records: list[dict[str, Any]] = []
    for path in files:
        # Reassemble per-file: each file is self-contained because the writer
        # re-emits the session_header on every process start, so we never need
        # cross-file state to fill in messages/system_prompt/tools.
        file_records: list[dict[str, Any]] = []
        try:
            with path.open(encoding="utf-8") as handle:
                for line_number, raw_line in enumerate(handle, start=1):
@@ -166,17 +221,23 @@ def _load_session_data(logs_dir: Path, session_id: str, limit_files: int) -> lis
                    try:
                        payload = json.loads(line)
                    except json.JSONDecodeError:
-                        payload = {
+                        records.append(
                            {
                                "timestamp": "",
                                "execution_id": "",
                                "_parse_error": f"{path.name}:{line_number}",
                                "_raw_line": line,
                                "_log_file": str(path),
                            }
-                    if str(payload.get("execution_id") or "").strip() == session_id:
+                        )
                        continue
                    if str(payload.get("execution_id") or "").strip() != session_id:
                        continue
                    payload["_log_file"] = str(path)
-                        records.append(payload)
+                    file_records.append(payload)
        except OSError:
            continue
        records.extend(_reassemble_records(file_records))
    if not records:
        return None