fix: isolate session loading

Replace the fixed "persistent" resume_session_id with a session ID
minted once per agent load (session_<timestamp>_<8-hex-uuid>) and
shared by the queen, judge, and worker, so each load starts a fresh,
mutually scoped conversation. Also rename the judge stream and storage
dir from "worker_health_judge" to "judge", stop retrying empty
completions that finish with finish_reason=length, raise the
subgraph-extraction token budget from 1000 to 4096, and drop subgraph
extraction from the agent load path.

Timothy
2026-02-24 11:02:58 -08:00
parent 28a71b70a8
commit 3963855d1d
6 changed files with 96 additions and 18910 deletions
+48
@@ -328,6 +328,20 @@ class LiteLLMProvider(LLMProvider):
                 f"Full request dumped to: {dump_path}"
             )
+            # finish_reason=length means the model exhausted max_tokens
+            # before producing content. Retrying with the same max_tokens
+            # will never help — return immediately instead of looping.
+            if finish_reason == "length":
+                max_tok = kwargs.get("max_tokens", "unset")
+                logger.error(
+                    f"[retry] {model} returned empty content with "
+                    f"finish_reason=length (max_tokens={max_tok}). "
+                    f"The model exhausted its token budget before "
+                    f"producing visible output. Increase max_tokens "
+                    f"or use a different model. Not retrying."
+                )
+                return response
             if attempt == retries:
                 logger.error(
                     f"[retry] GAVE UP on {model} after {retries + 1} "
@@ -621,6 +635,20 @@ class LiteLLMProvider(LLMProvider):
                 f"Full request dumped to: {dump_path}"
             )
+            # finish_reason=length means the model exhausted max_tokens
+            # before producing content. Retrying with the same max_tokens
+            # will never help — return immediately instead of looping.
+            if finish_reason == "length":
+                max_tok = kwargs.get("max_tokens", "unset")
+                logger.error(
+                    f"[async-retry] {model} returned empty content with "
+                    f"finish_reason=length (max_tokens={max_tok}). "
+                    f"The model exhausted its token budget before "
+                    f"producing visible output. Increase max_tokens "
+                    f"or use a different model. Not retrying."
+                )
+                return response
             if attempt == retries:
                 logger.error(
                     f"[async-retry] GAVE UP on {model} after {retries + 1} "
@@ -903,6 +931,7 @@ class LiteLLMProvider(LLMProvider):
         tool_calls_acc: dict[int, dict[str, str]] = {}
         input_tokens = 0
         output_tokens = 0
+        stream_finish_reason: str | None = None
         try:
             response = await litellm.acompletion(**kwargs)  # type: ignore[union-attr]
@@ -938,6 +967,7 @@ class LiteLLMProvider(LLMProvider):
                 # --- Finish ---
                 if choice.finish_reason:
+                    stream_finish_reason = choice.finish_reason
                     for _idx, tc_data in sorted(tool_calls_acc.items()):
                         try:
                             parsed_args = json.loads(tc_data["arguments"])
@@ -992,6 +1022,24 @@ class LiteLLMProvider(LLMProvider):
                 for event in tail_events:
                     yield event
                 return
+            # finish_reason=length means the model exhausted
+            # max_tokens before producing content. Retrying with
+            # the same max_tokens will never help.
+            if stream_finish_reason == "length":
+                max_tok = kwargs.get("max_tokens", "unset")
+                logger.error(
+                    f"[stream] {self.model} returned empty content "
+                    f"with finish_reason=length "
+                    f"(max_tokens={max_tok}). The model exhausted "
+                    f"its token budget before producing visible "
+                    f"output. Increase max_tokens or use a "
+                    f"different model. Not retrying."
+                )
+                for event in tail_events:
+                    yield event
+                return
             wait = _compute_retry_delay(attempt)
             token_count, token_method = _estimate_tokens(
                 self.model,
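The guard above appears three times (sync retry, async retry, and the streaming path) but encodes a single decision: an empty completion whose finish_reason is "length" is not retryable, because a retry with the same max_tokens will exhaust the budget the same way. A minimal sketch of that decision; the Choice shape and should_retry_empty name are illustrative, not the repo's API:

```python
from dataclasses import dataclass


@dataclass
class Choice:
    finish_reason: str | None
    content: str


def should_retry_empty(choice: Choice) -> bool:
    """Decide whether an empty completion is worth another attempt."""
    if choice.content:
        return False  # got content; nothing to retry
    if choice.finish_reason == "length":
        # max_tokens ran out before any visible output; the identical
        # request will fail identically, so retrying cannot help
        return False
    return True  # transient empty response; worth retrying


assert should_retry_empty(Choice("length", "")) is False
assert should_retry_empty(Choice("stop", "")) is True
assert should_retry_empty(Choice("stop", "hi")) is False
```

Placing the check ahead of the backoff branch is what lets all three call sites bail out immediately instead of burning further attempts.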
+19 -15
@@ -61,7 +61,7 @@ async def _extract_subgraph_steps(nodes: list, llm: Any) -> None:
     response = await llm.acomplete(
         messages=[{"role": "user", "content": prompt}],
-        max_tokens=1000,
+        max_tokens=4096,
         json_mode=True,
     )
@@ -172,13 +172,6 @@ class AgentManager:
         if runner._agent_runtime is None:
             await loop.run_in_executor(None, runner._setup)
-        # Extract subgraph steps for frontend visualization (non-critical)
-        if runner.graph and runner._llm:
-            try:
-                await _extract_subgraph_steps(runner.graph.nodes, runner._llm)
-            except Exception as e:
-                logger.warning(f"Subgraph extraction skipped: {e}")
         runtime = runner._agent_runtime
         # Start runtime on event loop
@@ -224,6 +217,9 @@ class AgentManager:
         - **Judge**: timer-driven background GraphExecutor (silent monitoring)
         - **Worker**: the existing AgentRuntime (unchanged)
         """
+        import uuid
+        from datetime import datetime
         from framework.graph.executor import GraphExecutor
         from framework.monitoring import judge_goal, judge_graph
         from framework.runner.tool_registry import ToolRegistry
@@ -238,6 +234,12 @@ class AgentManager:
         event_bus = runtime._event_bus
         llm = runtime._llm
+        # Generate a shared session ID for queen, judge, and worker.
+        # All three use the same ID so conversations are scoped to this
+        # agent load and start fresh each time.
+        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+        session_id = f"session_{ts}_{uuid.uuid4().hex[:8]}"
         # 1. Monitoring tools — standalone registry, NOT merged into worker
         monitoring_registry = ToolRegistry()
         register_worker_monitoring_tools(
@@ -247,14 +249,15 @@ class AgentManager:
             worker_graph_id=runtime._graph_id,
         )
-        # 2. Storage dirs
-        judge_dir = storage_path / "graphs" / "worker_health_judge" / "session"
+        # 2. Storage dirs — scoped by session_id so each agent load
+        # gets fresh queen/judge conversations.
+        judge_dir = storage_path / "graphs" / "judge" / "session" / session_id
         judge_dir.mkdir(parents=True, exist_ok=True)
-        queen_dir = storage_path / "graphs" / "queen" / "session"
+        queen_dir = storage_path / "graphs" / "queen" / "session" / session_id
         queen_dir.mkdir(parents=True, exist_ok=True)
         # 3. Health judge — background task, fires every 2 minutes
-        judge_runtime = Runtime(storage_path / "graphs" / "worker_health_judge")
+        judge_runtime = Runtime(storage_path / "graphs" / "judge")
         monitoring_tools = list(monitoring_registry.get_tools().values())
         monitoring_executor = monitoring_registry.get_executor()
@@ -272,7 +275,7 @@ class AgentManager:
             tools=monitoring_tools,
             tool_executor=monitoring_executor,
             event_bus=event_bus,
-            stream_id="worker_health_judge",
+            stream_id="judge",
             storage_path=judge_dir,
             loop_config=judge_graph.loop_config,
         )
@@ -282,7 +285,7 @@ class AgentManager:
                 input_data={
                     "event": {"source": "timer", "reason": "scheduled"},
                 },
-                session_state={"resume_session_id": "persistent"},
+                session_state={"resume_session_id": session_id},
             )
         except Exception:
             logger.error("Health judge tick failed", exc_info=True)
@@ -300,6 +303,7 @@ class AgentManager:
             worker_runtime=runtime,
             event_bus=event_bus,
             storage_path=storage_path,
+            session_id=session_id,
         )
         register_worker_monitoring_tools(
             queen_registry,
@@ -365,7 +369,7 @@ class AgentManager:
                 graph=queen_graph,
                 goal=queen_goal,
                 input_data={"greeting": "Session started."},
-                session_state={"resume_session_id": "persistent"},
+                session_state={"resume_session_id": session_id},
             )
             logger.warning("Queen executor returned (should be forever-alive)")
         except Exception:
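The wiring above replaces the fixed "persistent" resume ID with a session ID minted once per agent load and shared by queen, judge, and worker. A sketch of the scheme: new_session_id mirrors the diff, while session_dir is a hypothetical helper that folds in the mkdir calls shown above:

```python
import uuid
from datetime import datetime
from pathlib import Path


def new_session_id() -> str:
    # Timestamp gives human-readable ordering; the UUID slice avoids
    # collisions between two loads in the same second.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"session_{ts}_{uuid.uuid4().hex[:8]}"


def session_dir(storage_path: Path, role: str, session_id: str) -> Path:
    # e.g. <storage>/graphs/queen/session/session_20260224_110258_3f9a1c2b
    d = storage_path / "graphs" / role / "session" / session_id
    d.mkdir(parents=True, exist_ok=True)
    return d


sid = new_session_id()
for role in ("queen", "judge"):
    print(session_dir(Path("/tmp/agent"), role, sid))
```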
+11 -1
@@ -36,9 +36,14 @@ def register_queen_lifecycle_tools(
     worker_runtime: AgentRuntime,
     event_bus: EventBus,
     storage_path: Path | None = None,
+    session_id: str | None = None,
 ) -> int:
     """Register queen lifecycle tools bound to *worker_runtime*.
+    Args:
+        session_id: Shared session ID so the worker uses the same session
+            scope as the queen and judge.
     Returns the number of tools registered.
     """
     from framework.llm.provider import Tool
@@ -55,7 +60,12 @@ def register_queen_lifecycle_tools(
         """
         try:
             # Get session state from any prior execution for memory continuity
-            session_state = worker_runtime._get_primary_session_state("default")
+            session_state = worker_runtime._get_primary_session_state("default") or {}
+            # Use the shared session ID so queen, judge, and worker all
+            # scope their conversations to the same session.
+            if session_id:
+                session_state["resume_session_id"] = session_id
             exec_id = await worker_runtime.trigger(
                 entry_point_id="default",
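Two fixes ride together here: _get_primary_session_state can evidently return None on a first-ever trigger (hence the or {}), and the shared ID is injected so the worker scopes its conversation to the same session as the queen and judge. A sketch of the combined flow, with resume_state as an illustrative name that is not in the diff:

```python
def resume_state(prior: dict | None, session_id: str | None) -> dict:
    state = prior or {}  # None on a first-ever trigger -> fresh dict
    if session_id:
        # All three roles pass the same ID, so the worker joins the
        # queen/judge session instead of resuming an older one.
        state["resume_session_id"] = session_id
    return state


assert resume_state(None, "session_x") == {"resume_session_id": "session_x"}
assert resume_state({"memory": 1}, "session_x")["memory"] == 1
```

Without the or {}, the subscript assignment on a None return would raise a TypeError before the worker ever started.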
+13 -9
@@ -468,6 +468,8 @@ class AdenTUI(App):
         into the worker runtime. The worker is completely untouched.
         """
         import asyncio
+        import uuid
+        from datetime import datetime
         from pathlib import Path
         from framework.graph.executor import GraphExecutor
@@ -486,6 +488,10 @@ class AdenTUI(App):
         llm = self.runtime._llm
         agent_loop = self.chat_repl._agent_loop
+        # Generate a shared session ID for queen, judge, and worker.
+        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+        session_id = f"session_{ts}_{uuid.uuid4().hex[:8]}"
         # 1. Monitoring tools (health summary, emit ticket, notify operator).
         # Registered on a standalone registry — NOT merged into the worker.
         monitoring_registry = ToolRegistry()
@@ -496,11 +502,11 @@ class AdenTUI(App):
             worker_graph_id=self.runtime._graph_id,
         )
-        # 2. Storage dirs — under worker's base path but completely owned
-        # by the judge/queen. Worker never writes here.
-        judge_dir = storage_path / "graphs" / "judge" / "session"
+        # 2. Storage dirs — scoped by session_id so each agent load
+        # gets fresh queen/judge conversations.
+        judge_dir = storage_path / "graphs" / "judge" / "session" / session_id
         judge_dir.mkdir(parents=True, exist_ok=True)
-        queen_dir = storage_path / "graphs" / "queen" / "session"
+        queen_dir = storage_path / "graphs" / "queen" / "session" / session_id
         queen_dir.mkdir(parents=True, exist_ok=True)
         # ---------------------------------------------------------------
@@ -542,7 +548,7 @@ class AdenTUI(App):
                 input_data={
                     "event": {"source": "timer", "reason": "scheduled"},
                 },
-                session_state={"resume_session_id": "persistent"},
+                session_state={"resume_session_id": session_id},
             )
         except Exception:
             log.error("Health judge tick failed", exc_info=True)
@@ -584,6 +590,7 @@ class AdenTUI(App):
             worker_runtime=self.runtime,
             event_bus=event_bus,
             storage_path=storage_path,
+            session_id=session_id,
         )
         register_worker_monitoring_tools(
             queen_registry,
@@ -596,9 +603,6 @@ class AdenTUI(App):
         queen_tool_executor = queen_registry.get_executor()
         # Build worker identity to inject into the queen's system prompt.
-        # This must be in the system prompt (not input_data) because
-        # persistent sessions restore the old conversation and skip
-        # _build_initial_message — the queen would lose context.
         worker_graph_id = self.runtime._graph_id
         worker_goal_name = getattr(self.runtime.goal, "name", worker_graph_id)
         worker_goal_desc = getattr(self.runtime.goal, "description", "")
@@ -657,7 +661,7 @@ class AdenTUI(App):
             graph=queen_graph,
             goal=queen_goal,
             input_data={"greeting": "Session started."},
-            session_state={"resume_session_id": "persistent"},
+            session_state={"resume_session_id": session_id},
         )
         # Should never reach here — queen is forever-alive.
         log.warning(
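The TUI path mirrors the AgentManager wiring, and together they explain the commit title: a fixed "persistent" ID made every load resume the same stored conversation, which is also why the old comment about skipping _build_initial_message could go. A toy session store, assuming resume_session_id keys the restored history, shows the difference:

```python
# In-memory stand-in for the framework's session storage (assumed shape).
_conversations: dict[str, list[dict]] = {}


def load_conversation(resume_session_id: str) -> list[dict]:
    # A fixed ID returns the same history on every agent load;
    # a per-load ID maps to an empty list, i.e. a fresh conversation.
    return _conversations.setdefault(resume_session_id, [])


a = load_conversation("persistent")
b = load_conversation("persistent")
assert a is b  # old behavior: every load resumed the same session

fresh = load_conversation("session_20260224_110258_3f9a1c2b")
assert fresh == [] and fresh is not a  # new behavior: isolated per load
```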
+1 -1
@@ -569,7 +569,7 @@ export default function Workspace() {
       const streamId = event.stream_id;
       // Suppress judge events (silent background monitoring)
-      if (streamId === "worker_health_judge") return;
+      if (streamId === "judge") return;
       // Determine if this is a queen event
       const isQueen = streamId === "queen";
+4 -18884
File diff suppressed because it is too large.