3e6a34297d
Squashes 25 PR commits onto current main. AppConfig becomes a pure value object with no ambient lookup. Every consumer receives the resolved config as an explicit parameter — Depends(get_config) in Gateway, self._app_config in DeerFlowClient, runtime.context.app_config in agent runs, AppConfig.from_file() at the LangGraph Server registration boundary. Phase 1 — frozen data + typed context - All config models (AppConfig, MemoryConfig, DatabaseConfig, …) become frozen=True; no sub-module globals. - AppConfig.from_file() is pure (no side-effect singleton loaders). - Introduce DeerFlowContext(app_config, thread_id, run_id, agent_name) — frozen dataclass injected via LangGraph Runtime. - Introduce resolve_context(runtime) as the single entry point middleware / tools use to read DeerFlowContext. Phase 2 — pure explicit parameter passing - Gateway: app.state.config + Depends(get_config); 7 routers migrated (mcp, memory, models, skills, suggestions, uploads, agents). - DeerFlowClient: __init__(config=...) captures config locally. - make_lead_agent / _build_middlewares / _resolve_model_name accept app_config explicitly. - RunContext.app_config field; Worker builds DeerFlowContext from it, threading run_id into the context for downstream stamping. - Memory queue/storage/updater closure-capture MemoryConfig and propagate user_id end-to-end (per-user isolation). - Sandbox/skills/community/factories/tools thread app_config. - resolve_context() rejects non-typed runtime.context. - Test suite migrated off AppConfig.current() monkey-patches. - AppConfig.current() classmethod deleted. Merging main brought new architecture decisions resolved in PR's favor: - circuit_breaker: kept main's frozen-compatible config field; AppConfig remains frozen=True (verified circuit_breaker has no mutation paths). - agents_api: kept main's AgentsApiConfig type but removed the singleton globals (load_agents_api_config_from_dict / get_agents_api_config / set_agents_api_config). 8 routes in agents.py now read via Depends(get_config). - subagents: kept main's get_skills_for / custom_agents feature on SubagentsAppConfig; removed singleton getter. registry.py now reads app_config.subagents directly. - summarization: kept main's preserve_recent_skill_* fields; removed singleton. - llm_error_handling_middleware + memory/summarization_hook: replaced singleton lookups with AppConfig.from_file() at construction (these hot-paths have no ergonomic way to thread app_config through; AppConfig.from_file is a pure load). - worker.py + thread_data_middleware.py: DeerFlowContext.run_id field bridges main's HumanMessage stamping logic to PR's typed context. Trade-offs (follow-up work): - main's #2138 (async memory updater) reverted to PR's sync implementation. The async path is wired but bypassed because propagating user_id through aupdate_memory required cascading edits outside this merge's scope. - tests/test_subagent_skills_config.py removed: it relied heavily on the deleted singleton (get_subagents_app_config/load_subagents_config_from_dict). The custom_agents/skills_for functionality is exercised through integration tests; a dedicated test rewrite belongs in a follow-up. Verification: backend test suite — 2560 passed, 4 skipped, 84 failures. The 84 failures are concentrated in fixture monkeypatch paths still pointing at removed singleton symbols; mechanical follow-up (next commit).
670 lines
28 KiB
Python
670 lines
28 KiB
Python
"""Subagent execution engine."""
|
|
|
|
import asyncio
|
|
import logging
|
|
import threading
|
|
import uuid
|
|
from concurrent.futures import Future, ThreadPoolExecutor
|
|
from concurrent.futures import TimeoutError as FuturesTimeoutError
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from typing import Any
|
|
|
|
from langchain.agents import create_agent
|
|
from langchain.tools import BaseTool
|
|
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
|
from langchain_core.runnables import RunnableConfig
|
|
|
|
from deerflow.agents.thread_state import SandboxState, ThreadDataState, ThreadState
|
|
from deerflow.config.app_config import AppConfig
|
|
from deerflow.models import create_chat_model
|
|
from deerflow.subagents.config import SubagentConfig
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SubagentStatus(Enum):
|
|
"""Status of a subagent execution."""
|
|
|
|
PENDING = "pending"
|
|
RUNNING = "running"
|
|
COMPLETED = "completed"
|
|
FAILED = "failed"
|
|
CANCELLED = "cancelled"
|
|
TIMED_OUT = "timed_out"
|
|
|
|
|
|
@dataclass
|
|
class SubagentResult:
|
|
"""Result of a subagent execution.
|
|
|
|
Attributes:
|
|
task_id: Unique identifier for this execution.
|
|
trace_id: Trace ID for distributed tracing (links parent and subagent logs).
|
|
status: Current status of the execution.
|
|
result: The final result message (if completed).
|
|
error: Error message (if failed).
|
|
started_at: When execution started.
|
|
completed_at: When execution completed.
|
|
ai_messages: List of complete AI messages (as dicts) generated during execution.
|
|
"""
|
|
|
|
task_id: str
|
|
trace_id: str
|
|
status: SubagentStatus
|
|
result: str | None = None
|
|
error: str | None = None
|
|
started_at: datetime | None = None
|
|
completed_at: datetime | None = None
|
|
ai_messages: list[dict[str, Any]] | None = None
|
|
cancel_event: threading.Event = field(default_factory=threading.Event, repr=False)
|
|
|
|
def __post_init__(self):
|
|
"""Initialize mutable defaults."""
|
|
if self.ai_messages is None:
|
|
self.ai_messages = []
|
|
|
|
|
|
# Global storage for background task results
|
|
_background_tasks: dict[str, SubagentResult] = {}
|
|
_background_tasks_lock = threading.Lock()
|
|
|
|
# Thread pool for background task scheduling and orchestration
|
|
_scheduler_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-scheduler-")
|
|
|
|
# Thread pool for actual subagent execution (with timeout support)
|
|
# Larger pool to avoid blocking when scheduler submits execution tasks
|
|
_execution_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-exec-")
|
|
|
|
# Dedicated pool for sync execute() calls made from an already-running event loop.
|
|
_isolated_loop_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-isolated-")
|
|
|
|
|
|
def _filter_tools(
|
|
all_tools: list[BaseTool],
|
|
allowed: list[str] | None,
|
|
disallowed: list[str] | None,
|
|
) -> list[BaseTool]:
|
|
"""Filter tools based on subagent configuration.
|
|
|
|
Args:
|
|
all_tools: List of all available tools.
|
|
allowed: Optional allowlist of tool names. If provided, only these tools are included.
|
|
disallowed: Optional denylist of tool names. These tools are always excluded.
|
|
|
|
Returns:
|
|
Filtered list of tools.
|
|
"""
|
|
filtered = all_tools
|
|
|
|
# Apply allowlist if specified
|
|
if allowed is not None:
|
|
allowed_set = set(allowed)
|
|
filtered = [t for t in filtered if t.name in allowed_set]
|
|
|
|
# Apply denylist
|
|
if disallowed is not None:
|
|
disallowed_set = set(disallowed)
|
|
filtered = [t for t in filtered if t.name not in disallowed_set]
|
|
|
|
return filtered
|
|
|
|
|
|
def _get_model_name(config: SubagentConfig, parent_model: str | None) -> str | None:
|
|
"""Resolve the model name for a subagent.
|
|
|
|
Args:
|
|
config: Subagent configuration.
|
|
parent_model: The parent agent's model name.
|
|
|
|
Returns:
|
|
Model name to use, or None to use default.
|
|
"""
|
|
if config.model == "inherit":
|
|
return parent_model
|
|
return config.model
|
|
|
|
|
|
class SubagentExecutor:
|
|
"""Executor for running subagents."""
|
|
|
|
def __init__(
|
|
self,
|
|
config: SubagentConfig,
|
|
tools: list[BaseTool],
|
|
app_config: AppConfig,
|
|
parent_model: str | None = None,
|
|
sandbox_state: SandboxState | None = None,
|
|
thread_data: ThreadDataState | None = None,
|
|
thread_id: str | None = None,
|
|
trace_id: str | None = None,
|
|
):
|
|
"""Initialize the executor."""
|
|
self.config = config
|
|
self.app_config = app_config
|
|
self.parent_model = parent_model
|
|
self.sandbox_state = sandbox_state
|
|
self.thread_data = thread_data
|
|
self.thread_id = thread_id
|
|
# Generate trace_id if not provided (for top-level calls)
|
|
self.trace_id = trace_id or str(uuid.uuid4())[:8]
|
|
|
|
# Filter tools based on config
|
|
self.tools = _filter_tools(
|
|
tools,
|
|
config.tools,
|
|
config.disallowed_tools,
|
|
)
|
|
|
|
logger.info(f"[trace={self.trace_id}] SubagentExecutor initialized: {config.name} with {len(self.tools)} tools")
|
|
|
|
def _create_agent(self):
|
|
"""Create the agent instance."""
|
|
model_name = _get_model_name(self.config, self.parent_model)
|
|
model = create_chat_model(name=model_name, thinking_enabled=False, app_config=self.app_config)
|
|
|
|
from deerflow.agents.middlewares.tool_error_handling_middleware import build_subagent_runtime_middlewares
|
|
|
|
# Reuse shared middleware composition with lead agent.
|
|
middlewares = build_subagent_runtime_middlewares(lazy_init=True)
|
|
|
|
return create_agent(
|
|
model=model,
|
|
tools=self.tools,
|
|
middleware=middlewares,
|
|
system_prompt=self.config.system_prompt,
|
|
state_schema=ThreadState,
|
|
)
|
|
|
|
async def _load_skill_messages(self) -> list[SystemMessage]:
|
|
"""Load skill content as conversation items based on config.skills.
|
|
|
|
Aligned with Codex's pattern: each subagent loads its own skills
|
|
per-session and injects them as conversation items (developer messages),
|
|
not as system prompt text. The config.skills whitelist controls which
|
|
skills are loaded:
|
|
- None: load all enabled skills
|
|
- []: no skills
|
|
- ["skill-a", "skill-b"]: only these skills
|
|
|
|
Returns:
|
|
List of SystemMessages containing skill content.
|
|
"""
|
|
if self.config.skills is not None and len(self.config.skills) == 0:
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} skills=[] — skipping skill loading")
|
|
return []
|
|
|
|
try:
|
|
from deerflow.skills.loader import load_skills
|
|
|
|
# Use asyncio.to_thread to avoid blocking the event loop (LangGraph ASGI requirement)
|
|
all_skills = await asyncio.to_thread(load_skills, enabled_only=True)
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} loaded {len(all_skills)} enabled skills from disk")
|
|
except Exception:
|
|
logger.warning(f"[trace={self.trace_id}] Failed to load skills for subagent {self.config.name}", exc_info=True)
|
|
return []
|
|
|
|
if not all_skills:
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} no enabled skills found")
|
|
return []
|
|
|
|
# Filter by config.skills whitelist
|
|
if self.config.skills is not None:
|
|
allowed = set(self.config.skills)
|
|
skills = [s for s in all_skills if s.name in allowed]
|
|
else:
|
|
skills = all_skills
|
|
|
|
if not skills:
|
|
return []
|
|
|
|
# Read each skill's SKILL.md content and create conversation items
|
|
messages = []
|
|
for skill in skills:
|
|
try:
|
|
content = await asyncio.to_thread(skill.skill_file.read_text, encoding="utf-8")
|
|
content = content.strip()
|
|
if content:
|
|
messages.append(SystemMessage(content=f'<skill name="{skill.name}">\n{content}\n</skill>'))
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} loaded skill: {skill.name}")
|
|
except Exception:
|
|
logger.debug(f"[trace={self.trace_id}] Failed to read skill {skill.name}", exc_info=True)
|
|
|
|
return messages
|
|
|
|
async def _build_initial_state(self, task: str) -> dict[str, Any]:
|
|
"""Build the initial state for agent execution.
|
|
|
|
Args:
|
|
task: The task description.
|
|
|
|
Returns:
|
|
Initial state dictionary.
|
|
"""
|
|
# Load skills as conversation items (Codex pattern)
|
|
skill_messages = await self._load_skill_messages()
|
|
|
|
messages: list = []
|
|
# Skill content injected as developer/system messages before the task
|
|
messages.extend(skill_messages)
|
|
# Then the actual task
|
|
messages.append(HumanMessage(content=task))
|
|
|
|
state: dict[str, Any] = {
|
|
"messages": messages,
|
|
}
|
|
|
|
# Pass through sandbox and thread data from parent
|
|
if self.sandbox_state is not None:
|
|
state["sandbox"] = self.sandbox_state
|
|
if self.thread_data is not None:
|
|
state["thread_data"] = self.thread_data
|
|
|
|
return state
|
|
|
|
async def _aexecute(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
|
|
"""Execute a task asynchronously.
|
|
|
|
Args:
|
|
task: The task description for the subagent.
|
|
result_holder: Optional pre-created result object to update during execution.
|
|
|
|
Returns:
|
|
SubagentResult with the execution result.
|
|
"""
|
|
if result_holder is not None:
|
|
# Use the provided result holder (for async execution with real-time updates)
|
|
result = result_holder
|
|
else:
|
|
# Create a new result for synchronous execution
|
|
task_id = str(uuid.uuid4())[:8]
|
|
result = SubagentResult(
|
|
task_id=task_id,
|
|
trace_id=self.trace_id,
|
|
status=SubagentStatus.RUNNING,
|
|
started_at=datetime.now(),
|
|
)
|
|
|
|
try:
|
|
agent = self._create_agent()
|
|
state = await self._build_initial_state(task)
|
|
|
|
# Build config with thread_id for sandbox access and recursion limit
|
|
run_config: RunnableConfig = {
|
|
"recursion_limit": self.config.max_turns,
|
|
}
|
|
context = {}
|
|
if self.thread_id:
|
|
run_config["configurable"] = {"thread_id": self.thread_id}
|
|
context["thread_id"] = self.thread_id
|
|
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution with max_turns={self.config.max_turns}")
|
|
|
|
# Use stream instead of invoke to get real-time updates
|
|
# This allows us to collect AI messages as they are generated
|
|
final_state = None
|
|
|
|
# Pre-check: bail out immediately if already cancelled before streaming starts
|
|
if result.cancel_event.is_set():
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} cancelled before streaming")
|
|
with _background_tasks_lock:
|
|
if result.status == SubagentStatus.RUNNING:
|
|
result.status = SubagentStatus.CANCELLED
|
|
result.error = "Cancelled by user"
|
|
result.completed_at = datetime.now()
|
|
return result
|
|
|
|
async for chunk in agent.astream(state, config=run_config, context=context, stream_mode="values"): # type: ignore[arg-type]
|
|
# Cooperative cancellation: check if parent requested stop.
|
|
# Note: cancellation is only detected at astream iteration boundaries,
|
|
# so long-running tool calls within a single iteration will not be
|
|
# interrupted until the next chunk is yielded.
|
|
if result.cancel_event.is_set():
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} cancelled by parent")
|
|
with _background_tasks_lock:
|
|
if result.status == SubagentStatus.RUNNING:
|
|
result.status = SubagentStatus.CANCELLED
|
|
result.error = "Cancelled by user"
|
|
result.completed_at = datetime.now()
|
|
return result
|
|
|
|
final_state = chunk
|
|
|
|
# Extract AI messages from the current state
|
|
messages = chunk.get("messages", [])
|
|
if messages:
|
|
last_message = messages[-1]
|
|
# Check if this is a new AI message
|
|
if isinstance(last_message, AIMessage):
|
|
# Convert message to dict for serialization
|
|
message_dict = last_message.model_dump()
|
|
# Only add if it's not already in the list (avoid duplicates)
|
|
# Check by comparing message IDs if available, otherwise compare full dict
|
|
message_id = message_dict.get("id")
|
|
is_duplicate = False
|
|
if message_id:
|
|
is_duplicate = any(msg.get("id") == message_id for msg in result.ai_messages)
|
|
else:
|
|
is_duplicate = message_dict in result.ai_messages
|
|
|
|
if not is_duplicate:
|
|
result.ai_messages.append(message_dict)
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} captured AI message #{len(result.ai_messages)}")
|
|
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} completed async execution")
|
|
|
|
if final_state is None:
|
|
logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no final state")
|
|
result.result = "No response generated"
|
|
else:
|
|
# Extract the final message - find the last AIMessage
|
|
messages = final_state.get("messages", [])
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} final messages count: {len(messages)}")
|
|
|
|
# Find the last AIMessage in the conversation
|
|
last_ai_message = None
|
|
for msg in reversed(messages):
|
|
if isinstance(msg, AIMessage):
|
|
last_ai_message = msg
|
|
break
|
|
|
|
if last_ai_message is not None:
|
|
content = last_ai_message.content
|
|
# Handle both str and list content types for the final result
|
|
if isinstance(content, str):
|
|
result.result = content
|
|
elif isinstance(content, list):
|
|
# Extract text from list of content blocks for final result only.
|
|
# Concatenate raw string chunks directly, but preserve separation
|
|
# between full text blocks for readability.
|
|
text_parts = []
|
|
pending_str_parts = []
|
|
for block in content:
|
|
if isinstance(block, str):
|
|
pending_str_parts.append(block)
|
|
elif isinstance(block, dict):
|
|
if pending_str_parts:
|
|
text_parts.append("".join(pending_str_parts))
|
|
pending_str_parts.clear()
|
|
text_val = block.get("text")
|
|
if isinstance(text_val, str):
|
|
text_parts.append(text_val)
|
|
if pending_str_parts:
|
|
text_parts.append("".join(pending_str_parts))
|
|
result.result = "\n".join(text_parts) if text_parts else "No text content in response"
|
|
else:
|
|
result.result = str(content)
|
|
elif messages:
|
|
# Fallback: use the last message if no AIMessage found
|
|
last_message = messages[-1]
|
|
logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no AIMessage found, using last message: {type(last_message)}")
|
|
raw_content = last_message.content if hasattr(last_message, "content") else str(last_message)
|
|
if isinstance(raw_content, str):
|
|
result.result = raw_content
|
|
elif isinstance(raw_content, list):
|
|
parts = []
|
|
pending_str_parts = []
|
|
for block in raw_content:
|
|
if isinstance(block, str):
|
|
pending_str_parts.append(block)
|
|
elif isinstance(block, dict):
|
|
if pending_str_parts:
|
|
parts.append("".join(pending_str_parts))
|
|
pending_str_parts.clear()
|
|
text_val = block.get("text")
|
|
if isinstance(text_val, str):
|
|
parts.append(text_val)
|
|
if pending_str_parts:
|
|
parts.append("".join(pending_str_parts))
|
|
result.result = "\n".join(parts) if parts else "No text content in response"
|
|
else:
|
|
result.result = str(raw_content)
|
|
else:
|
|
logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no messages in final state")
|
|
result.result = "No response generated"
|
|
|
|
result.status = SubagentStatus.COMPLETED
|
|
result.completed_at = datetime.now()
|
|
|
|
except Exception as e:
|
|
logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} async execution failed")
|
|
result.status = SubagentStatus.FAILED
|
|
result.error = str(e)
|
|
result.completed_at = datetime.now()
|
|
|
|
return result
|
|
|
|
def _execute_in_isolated_loop(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
|
|
"""Execute the subagent in a completely fresh event loop.
|
|
|
|
This method is designed to run in a separate thread to ensure complete
|
|
isolation from any parent event loop, preventing conflicts with asyncio
|
|
primitives that may be bound to the parent loop (e.g., httpx clients).
|
|
"""
|
|
try:
|
|
previous_loop = asyncio.get_event_loop()
|
|
except RuntimeError:
|
|
previous_loop = None
|
|
|
|
# Create and set a new event loop for this thread
|
|
loop = asyncio.new_event_loop()
|
|
try:
|
|
asyncio.set_event_loop(loop)
|
|
return loop.run_until_complete(self._aexecute(task, result_holder))
|
|
finally:
|
|
try:
|
|
pending = asyncio.all_tasks(loop)
|
|
if pending:
|
|
for task_obj in pending:
|
|
task_obj.cancel()
|
|
loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
|
|
|
|
loop.run_until_complete(loop.shutdown_asyncgens())
|
|
loop.run_until_complete(loop.shutdown_default_executor())
|
|
except Exception:
|
|
logger.debug(
|
|
f"[trace={self.trace_id}] Failed while cleaning up isolated event loop for subagent {self.config.name}",
|
|
exc_info=True,
|
|
)
|
|
finally:
|
|
try:
|
|
loop.close()
|
|
finally:
|
|
asyncio.set_event_loop(previous_loop)
|
|
|
|
def execute(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
|
|
"""Execute a task synchronously (wrapper around async execution).
|
|
|
|
This method runs the async execution in a new event loop, allowing
|
|
asynchronous tools (like MCP tools) to be used within the thread pool.
|
|
|
|
When called from within an already-running event loop (e.g., when the
|
|
parent agent is async), this method isolates the subagent execution in
|
|
a separate thread to avoid event loop conflicts with shared async
|
|
primitives like httpx clients.
|
|
|
|
Args:
|
|
task: The task description for the subagent.
|
|
result_holder: Optional pre-created result object to update during execution.
|
|
|
|
Returns:
|
|
SubagentResult with the execution result.
|
|
"""
|
|
try:
|
|
try:
|
|
loop = asyncio.get_running_loop()
|
|
except RuntimeError:
|
|
loop = None
|
|
|
|
if loop is not None and loop.is_running():
|
|
logger.debug(f"[trace={self.trace_id}] Subagent {self.config.name} detected running event loop, using isolated thread")
|
|
future = _isolated_loop_pool.submit(self._execute_in_isolated_loop, task, result_holder)
|
|
return future.result()
|
|
|
|
# Standard path: no running event loop, use asyncio.run
|
|
return asyncio.run(self._aexecute(task, result_holder))
|
|
except Exception as e:
|
|
logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} execution failed")
|
|
# Create a result with error if we don't have one
|
|
if result_holder is not None:
|
|
result = result_holder
|
|
else:
|
|
result = SubagentResult(
|
|
task_id=str(uuid.uuid4())[:8],
|
|
trace_id=self.trace_id,
|
|
status=SubagentStatus.FAILED,
|
|
)
|
|
result.status = SubagentStatus.FAILED
|
|
result.error = str(e)
|
|
result.completed_at = datetime.now()
|
|
return result
|
|
|
|
def execute_async(self, task: str, task_id: str | None = None) -> str:
|
|
"""Start a task execution in the background.
|
|
|
|
Args:
|
|
task: The task description for the subagent.
|
|
task_id: Optional task ID to use. If not provided, a random UUID will be generated.
|
|
|
|
Returns:
|
|
Task ID that can be used to check status later.
|
|
"""
|
|
# Use provided task_id or generate a new one
|
|
if task_id is None:
|
|
task_id = str(uuid.uuid4())[:8]
|
|
|
|
# Create initial pending result
|
|
result = SubagentResult(
|
|
task_id=task_id,
|
|
trace_id=self.trace_id,
|
|
status=SubagentStatus.PENDING,
|
|
)
|
|
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}, timeout={self.config.timeout_seconds}s")
|
|
|
|
with _background_tasks_lock:
|
|
_background_tasks[task_id] = result
|
|
|
|
# Submit to scheduler pool
|
|
def run_task():
|
|
with _background_tasks_lock:
|
|
_background_tasks[task_id].status = SubagentStatus.RUNNING
|
|
_background_tasks[task_id].started_at = datetime.now()
|
|
result_holder = _background_tasks[task_id]
|
|
|
|
try:
|
|
# Submit execution to execution pool with timeout
|
|
# Pass result_holder so execute() can update it in real-time
|
|
execution_future: Future = _execution_pool.submit(self.execute, task, result_holder)
|
|
try:
|
|
# Wait for execution with timeout
|
|
exec_result = execution_future.result(timeout=self.config.timeout_seconds)
|
|
with _background_tasks_lock:
|
|
_background_tasks[task_id].status = exec_result.status
|
|
_background_tasks[task_id].result = exec_result.result
|
|
_background_tasks[task_id].error = exec_result.error
|
|
_background_tasks[task_id].completed_at = datetime.now()
|
|
_background_tasks[task_id].ai_messages = exec_result.ai_messages
|
|
except FuturesTimeoutError:
|
|
logger.error(f"[trace={self.trace_id}] Subagent {self.config.name} execution timed out after {self.config.timeout_seconds}s")
|
|
with _background_tasks_lock:
|
|
if _background_tasks[task_id].status == SubagentStatus.RUNNING:
|
|
_background_tasks[task_id].status = SubagentStatus.TIMED_OUT
|
|
_background_tasks[task_id].error = f"Execution timed out after {self.config.timeout_seconds} seconds"
|
|
_background_tasks[task_id].completed_at = datetime.now()
|
|
# Signal cooperative cancellation and cancel the future
|
|
result_holder.cancel_event.set()
|
|
execution_future.cancel()
|
|
except Exception as e:
|
|
logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} async execution failed")
|
|
with _background_tasks_lock:
|
|
_background_tasks[task_id].status = SubagentStatus.FAILED
|
|
_background_tasks[task_id].error = str(e)
|
|
_background_tasks[task_id].completed_at = datetime.now()
|
|
|
|
_scheduler_pool.submit(run_task)
|
|
return task_id
|
|
|
|
|
|
MAX_CONCURRENT_SUBAGENTS = 3
|
|
|
|
|
|
def request_cancel_background_task(task_id: str) -> None:
|
|
"""Signal a running background task to stop.
|
|
|
|
Sets the cancel_event on the task, which is checked cooperatively
|
|
by ``_aexecute`` during ``agent.astream()`` iteration. This allows
|
|
subagent threads — which cannot be force-killed via ``Future.cancel()``
|
|
— to stop at the next iteration boundary.
|
|
|
|
Args:
|
|
task_id: The task ID to cancel.
|
|
"""
|
|
with _background_tasks_lock:
|
|
result = _background_tasks.get(task_id)
|
|
if result is not None:
|
|
result.cancel_event.set()
|
|
logger.info("Requested cancellation for background task %s", task_id)
|
|
|
|
|
|
def get_background_task_result(task_id: str) -> SubagentResult | None:
|
|
"""Get the result of a background task.
|
|
|
|
Args:
|
|
task_id: The task ID returned by execute_async.
|
|
|
|
Returns:
|
|
SubagentResult if found, None otherwise.
|
|
"""
|
|
with _background_tasks_lock:
|
|
return _background_tasks.get(task_id)
|
|
|
|
|
|
def list_background_tasks() -> list[SubagentResult]:
|
|
"""List all background tasks.
|
|
|
|
Returns:
|
|
List of all SubagentResult instances.
|
|
"""
|
|
with _background_tasks_lock:
|
|
return list(_background_tasks.values())
|
|
|
|
|
|
def cleanup_background_task(task_id: str) -> None:
|
|
"""Remove a completed task from background tasks.
|
|
|
|
Should be called by task_tool after it finishes polling and returns the result.
|
|
This prevents memory leaks from accumulated completed tasks.
|
|
|
|
Only removes tasks that are in a terminal state (COMPLETED/FAILED/TIMED_OUT)
|
|
to avoid race conditions with the background executor still updating the task entry.
|
|
|
|
Args:
|
|
task_id: The task ID to remove.
|
|
"""
|
|
with _background_tasks_lock:
|
|
result = _background_tasks.get(task_id)
|
|
if result is None:
|
|
# Nothing to clean up; may have been removed already.
|
|
logger.debug("Requested cleanup for unknown background task %s", task_id)
|
|
return
|
|
|
|
# Only clean up tasks that are in a terminal state to avoid races with
|
|
# the background executor still updating the task entry.
|
|
is_terminal_status = result.status in {
|
|
SubagentStatus.COMPLETED,
|
|
SubagentStatus.FAILED,
|
|
SubagentStatus.CANCELLED,
|
|
SubagentStatus.TIMED_OUT,
|
|
}
|
|
if is_terminal_status or result.completed_at is not None:
|
|
del _background_tasks[task_id]
|
|
logger.debug("Cleaned up background task: %s", task_id)
|
|
else:
|
|
logger.debug(
|
|
"Skipping cleanup for non-terminal background task %s (status=%s)",
|
|
task_id,
|
|
result.status.value if hasattr(result.status, "value") else result.status,
|
|
)
|