8ba01dfd83
* refactor: thread app config through lead prompt * fix: honor explicit app config across runtime paths * style: format subagent executor tests * fix: thread resolved app config and guard subagents-only fallback Address two PR review findings: 1. _create_summarization_middleware passed the original (possibly None) app_config into create_chat_model, forcing the model factory back to ambient get_app_config() and risking config drift between the middleware's resolved view and the model's view. Pass the resolved AppConfig instance through end-to-end. 2. get_available_subagent_names accepted Any-typed config and forwarded it to is_host_bash_allowed, which reads ``.sandbox``. A SubagentsAppConfig (also accepted upstream as a sum-type input) has no ``.sandbox`` attribute and would be silently treated as "no sandbox configured", incorrectly disabling the bash subagent. Guard on hasattr and fall back to ambient lookup otherwise. Adds regression tests for both paths. * chore: simplify hasattr guard and tighten regression tests - Collapse if/else into ternary in get_available_subagent_names; hasattr(None, ...) is False so the explicit None check was redundant. - Drop comments that narrate the change rather than explain non-obvious WHY (test names already convey intent). - Replace stringly-typed sentinel "no-arg" in regression test with direct args tuple comparison. --------- Co-authored-by: greatmengqi <chenmengqi.0376@bytedance.com>
792 lines
33 KiB
Python
792 lines
33 KiB
Python
"""Subagent execution engine."""
|
|
|
|
import asyncio
|
|
import atexit
|
|
import logging
|
|
import threading
|
|
import uuid
|
|
from collections.abc import Callable, Coroutine
|
|
from concurrent.futures import Future, ThreadPoolExecutor
|
|
from concurrent.futures import TimeoutError as FuturesTimeoutError
|
|
from contextvars import Context, copy_context
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from typing import Any
|
|
|
|
from langchain.agents import create_agent
|
|
from langchain.tools import BaseTool
|
|
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
|
from langchain_core.runnables import RunnableConfig
|
|
|
|
from deerflow.agents.thread_state import SandboxState, ThreadDataState, ThreadState
|
|
from deerflow.config import get_app_config
|
|
from deerflow.config.app_config import AppConfig
|
|
from deerflow.models import create_chat_model
|
|
from deerflow.subagents.config import SubagentConfig, resolve_subagent_model_name
|
|
|
|
# Module-level logger; execution paths prefix entries with [trace=...] for correlation.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Re-execution guard: if this module is executed again into an existing
# namespace (presumably importlib.reload / dev hot-reload — TODO confirm),
# a previous _shutdown_isolated_subagent_loop may still be registered with
# atexit and may reference the old module's loop globals. Unregister and run
# it now so the stale isolated loop is torn down before fresh state is built.
_previous_shutdown_isolated_subagent_loop = globals().get("_shutdown_isolated_subagent_loop")
if callable(_previous_shutdown_isolated_subagent_loop):
    atexit.unregister(_previous_shutdown_isolated_subagent_loop)
    _previous_shutdown_isolated_subagent_loop()
|
|
|
|
|
|
class SubagentStatus(Enum):
    """Status of a subagent execution."""

    PENDING = "pending"  # registered but not yet picked up by the scheduler pool
    RUNNING = "running"  # actively executing / streaming agent output
    COMPLETED = "completed"  # finished normally with a final result
    FAILED = "failed"  # an exception was raised during execution
    CANCELLED = "cancelled"  # stopped cooperatively via the result's cancel_event
    TIMED_OUT = "timed_out"  # exceeded config.timeout_seconds
|
|
|
|
|
|
@dataclass
class SubagentResult:
    """Result of a subagent execution.

    Attributes:
        task_id: Unique identifier for this execution.
        trace_id: Trace ID for distributed tracing (links parent and subagent logs).
        status: Current status of the execution.
        result: The final result message (if completed).
        error: Error message (if failed).
        started_at: When execution started.
        completed_at: When execution completed.
        ai_messages: List of complete AI messages (as dicts) generated during execution.
        cancel_event: Cooperative cancellation flag; set by the parent and
            polled by the executor between stream iterations.
    """

    task_id: str
    trace_id: str
    status: SubagentStatus
    result: str | None = None
    error: str | None = None
    started_at: datetime | None = None
    completed_at: datetime | None = None
    ai_messages: list[dict[str, Any]] | None = None
    # repr=False keeps the Event object out of logged/debug reprs.
    cancel_event: threading.Event = field(default_factory=threading.Event, repr=False)

    def __post_init__(self):
        """Initialize mutable defaults."""
        # Normalize None to an empty list so consumers can append unconditionally.
        if self.ai_messages is None:
            self.ai_messages = []
|
|
|
|
|
|
# Global storage for background task results, keyed by task_id.
_background_tasks: dict[str, SubagentResult] = {}
# Guards all reads/writes of _background_tasks and of per-task status fields.
_background_tasks_lock = threading.Lock()

# Thread pool for background task scheduling and orchestration.
_scheduler_pool = ThreadPoolExecutor(max_workers=3, thread_name_prefix="subagent-scheduler-")

# Persistent event loop for isolated subagent executions triggered from an
# already-running parent loop. Reusing one long-lived loop avoids creating a
# fresh loop per execution and then closing async resources bound to it.
# All four globals below are guarded by _isolated_subagent_loop_lock.
_isolated_subagent_loop: asyncio.AbstractEventLoop | None = None
_isolated_subagent_loop_thread: threading.Thread | None = None
_isolated_subagent_loop_started: threading.Event | None = None
_isolated_subagent_loop_lock = threading.Lock()
|
|
|
|
|
|
def _run_isolated_subagent_loop(
    loop: asyncio.AbstractEventLoop,
    started_event: threading.Event,
) -> None:
    """Run the persistent isolated subagent loop in a dedicated daemon thread.

    Args:
        loop: The event loop to serve; becomes this thread's current loop.
        started_event: Set once the loop is actually running, cleared when it exits.
    """
    asyncio.set_event_loop(loop)
    # Signal readiness from inside the loop so waiters know run_forever is live.
    loop.call_soon(started_event.set)
    try:
        loop.run_forever()
    finally:
        # Mark the loop as no longer serving once run_forever returns.
        started_event.clear()
|
|
|
|
|
|
def _shutdown_isolated_subagent_loop() -> None:
    """Stop and close the persistent isolated subagent loop.

    Detaches the loop from module state first (under the lock) so concurrent
    callers of _get_isolated_subagent_loop build a fresh loop instead of racing
    against the one being torn down. Safe to call when no loop exists.
    """
    global _isolated_subagent_loop, _isolated_subagent_loop_thread, _isolated_subagent_loop_started

    # Snapshot and clear the globals atomically; teardown happens outside the
    # lock so a slow thread join cannot block other callers.
    with _isolated_subagent_loop_lock:
        loop = _isolated_subagent_loop
        thread = _isolated_subagent_loop_thread
        _isolated_subagent_loop = None
        _isolated_subagent_loop_thread = None
        _isolated_subagent_loop_started = None

    if loop is None:
        return

    if loop.is_running():
        # stop() must be scheduled onto the loop's own thread.
        loop.call_soon_threadsafe(loop.stop)

    # Avoid self-join if shutdown is somehow invoked from the loop thread itself.
    if thread is not None and thread.is_alive() and thread is not threading.current_thread():
        thread.join(timeout=1)

    thread_stopped = thread is None or not thread.is_alive()
    loop_stopped = not loop.is_running()

    if not loop.is_closed():
        # Only close once both the thread and the loop have actually stopped;
        # otherwise leak the loop rather than close it out from under its thread.
        if thread_stopped and loop_stopped:
            loop.close()
        else:
            logger.warning(
                "Skipping close of isolated subagent loop because shutdown did not complete within timeout (thread_alive=%s, loop_running=%s)",
                thread is not None and thread.is_alive(),
                loop.is_running(),
            )
|
|
|
|
|
|
# Tear down the persistent isolated loop at interpreter exit.
atexit.register(_shutdown_isolated_subagent_loop)
|
|
|
|
|
|
def _get_isolated_subagent_loop() -> asyncio.AbstractEventLoop:
    """Return the persistent event loop used by isolated subagent executions.

    Lazily (re)creates the loop and its daemon thread when the loop is missing,
    closed, not running, or its owning thread has died.

    Raises:
        RuntimeError: If the loop thread fails to signal startup within 5
            seconds, or the module-level reference is unexpectedly missing.
    """
    global _isolated_subagent_loop, _isolated_subagent_loop_thread, _isolated_subagent_loop_started
    with _isolated_subagent_loop_lock:
        # A usable loop requires all of: loop exists, not closed, running, and
        # its serving thread still alive.
        thread_is_alive = _isolated_subagent_loop_thread is not None and _isolated_subagent_loop_thread.is_alive()
        loop_is_usable = _isolated_subagent_loop is not None and not _isolated_subagent_loop.is_closed() and _isolated_subagent_loop.is_running() and thread_is_alive

        if not loop_is_usable:
            loop = asyncio.new_event_loop()
            started_event = threading.Event()
            thread = threading.Thread(
                target=_run_isolated_subagent_loop,
                args=(loop, started_event),
                name="subagent-persistent-loop",
                daemon=True,
            )
            thread.start()
            if not started_event.wait(timeout=5):
                # Startup failed: best-effort teardown of the half-started loop
                # before surfacing the error.
                loop.call_soon_threadsafe(loop.stop)
                thread.join(timeout=1)
                loop.close()
                raise RuntimeError("Timed out starting isolated subagent event loop")
            _isolated_subagent_loop = loop
            _isolated_subagent_loop_thread = thread
            _isolated_subagent_loop_started = started_event

    if _isolated_subagent_loop is None:
        raise RuntimeError("Isolated subagent event loop is not initialized")
    return _isolated_subagent_loop
|
|
|
|
|
|
def _submit_to_isolated_loop_in_context(
    context: Context,
    coro_factory: Callable[[], Coroutine[Any, Any, SubagentResult]],
) -> Future[SubagentResult]:
    """Schedule a subagent coroutine on the isolated loop inside ``context``.

    The factory is invoked within the supplied Context so that ContextVar
    state captured from the parent is visible when the coroutine is created.
    """

    def _schedule() -> Future[SubagentResult]:
        return asyncio.run_coroutine_threadsafe(
            coro_factory(),
            _get_isolated_subagent_loop(),
        )

    return context.run(_schedule)
|
|
|
|
|
|
def _filter_tools(
|
|
all_tools: list[BaseTool],
|
|
allowed: list[str] | None,
|
|
disallowed: list[str] | None,
|
|
) -> list[BaseTool]:
|
|
"""Filter tools based on subagent configuration.
|
|
|
|
Args:
|
|
all_tools: List of all available tools.
|
|
allowed: Optional allowlist of tool names. If provided, only these tools are included.
|
|
disallowed: Optional denylist of tool names. These tools are always excluded.
|
|
|
|
Returns:
|
|
Filtered list of tools.
|
|
"""
|
|
filtered = all_tools
|
|
|
|
# Apply allowlist if specified
|
|
if allowed is not None:
|
|
allowed_set = set(allowed)
|
|
filtered = [t for t in filtered if t.name in allowed_set]
|
|
|
|
# Apply denylist
|
|
if disallowed is not None:
|
|
disallowed_set = set(disallowed)
|
|
filtered = [t for t in filtered if t.name not in disallowed_set]
|
|
|
|
return filtered
|
|
|
|
|
|
class SubagentExecutor:
|
|
"""Executor for running subagents."""
|
|
|
|
    def __init__(
        self,
        config: SubagentConfig,
        tools: list[BaseTool],
        app_config: AppConfig | None = None,
        parent_model: str | None = None,
        sandbox_state: SandboxState | None = None,
        thread_data: ThreadDataState | None = None,
        thread_id: str | None = None,
        trace_id: str | None = None,
    ):
        """Initialize the executor.

        Args:
            config: Subagent configuration.
            tools: List of all available tools (will be filtered against
                ``config.tools`` / ``config.disallowed_tools``).
            app_config: Resolved AppConfig. When None, ``_create_agent`` falls
                back to ``get_app_config()`` (matches the lead-agent factory's
                pattern).
            parent_model: The parent agent's model name for inheritance.
            sandbox_state: Sandbox state from parent agent.
            thread_data: Thread data from parent agent.
            thread_id: Thread ID for sandbox operations.
            trace_id: Trace ID from parent for distributed tracing.
        """
        self.config = config
        self.app_config = app_config
        self.parent_model = parent_model
        # Resolve eagerly only when it does not require loading config.yaml; otherwise defer
        # to _create_agent (which already loads app_config) so unit tests can construct
        # executors without a config file present.
        if config.model != "inherit" or parent_model is not None or app_config is not None:
            self.model_name: str | None = resolve_subagent_model_name(config, parent_model, app_config=app_config)
        else:
            # Sentinel for "resolve later in _create_agent".
            self.model_name = None
        self.sandbox_state = sandbox_state
        self.thread_data = thread_data
        self.thread_id = thread_id
        # Generate trace_id if not provided (for top-level calls)
        self.trace_id = trace_id or str(uuid.uuid4())[:8]

        # Filter tools based on config (allowlist first, then denylist)
        self.tools = _filter_tools(
            tools,
            config.tools,
            config.disallowed_tools,
        )

        logger.info(f"[trace={self.trace_id}] SubagentExecutor initialized: {config.name} with {len(self.tools)} tools")
|
|
|
|
    def _create_agent(self):
        """Create the agent instance.

        Resolves the app config (explicit instance first, ambient
        ``get_app_config()`` fallback), finishes any deferred model-name
        resolution, and composes the shared runtime middleware stack.
        """
        app_config = self.app_config or get_app_config()
        # Deferred path: __init__ skipped resolution to avoid loading
        # config.yaml; complete it now that app_config is available.
        if self.model_name is None:
            self.model_name = resolve_subagent_model_name(self.config, self.parent_model, app_config=app_config)
        model = create_chat_model(name=self.model_name, thinking_enabled=False, app_config=app_config)

        # Local import — NOTE(review): presumably avoids a circular import at
        # module load time; confirm before hoisting to the top of the file.
        from deerflow.agents.middlewares.tool_error_handling_middleware import build_subagent_runtime_middlewares

        # Reuse shared middleware composition with lead agent.
        middlewares = build_subagent_runtime_middlewares(app_config=app_config, model_name=self.model_name, lazy_init=True)

        return create_agent(
            model=model,
            tools=self.tools,
            middleware=middlewares,
            system_prompt=self.config.system_prompt,
            state_schema=ThreadState,
        )
|
|
|
|
    async def _load_skill_messages(self) -> list[SystemMessage]:
        """Load skill content as conversation items based on config.skills.

        Aligned with Codex's pattern: each subagent loads its own skills
        per-session and injects them as conversation items (developer messages),
        not as system prompt text. The config.skills whitelist controls which
        skills are loaded:
        - None: load all enabled skills
        - []: no skills
        - ["skill-a", "skill-b"]: only these skills

        Returns:
            List of SystemMessages containing skill content. Empty on any
            storage failure — skill loading is deliberately best-effort.
        """
        # Explicit empty whitelist means "no skills"; short-circuit before any I/O.
        if self.config.skills is not None and len(self.config.skills) == 0:
            logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} skills=[] — skipping skill loading")
            return []

        try:
            from deerflow.skills.storage import get_or_new_skill_storage

            # Forward app_config only when explicitly provided so the storage
            # factory can use its own ambient lookup otherwise.
            storage_kwargs = {"app_config": self.app_config} if self.app_config is not None else {}
            storage = await asyncio.to_thread(get_or_new_skill_storage, **storage_kwargs)
            # Use asyncio.to_thread to avoid blocking the event loop (LangGraph ASGI requirement)
            all_skills = await asyncio.to_thread(storage.load_skills, enabled_only=True)
            logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} loaded {len(all_skills)} enabled skills from disk")
        except Exception:
            # Best-effort: a broken skills store must not fail the subagent run.
            logger.warning(f"[trace={self.trace_id}] Failed to load skills for subagent {self.config.name}", exc_info=True)
            return []

        if not all_skills:
            logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} no enabled skills found")
            return []

        # Filter by config.skills whitelist
        if self.config.skills is not None:
            allowed = set(self.config.skills)
            skills = [s for s in all_skills if s.name in allowed]
        else:
            skills = all_skills

        if not skills:
            return []

        # Read each skill's SKILL.md content and create conversation items
        messages = []
        for skill in skills:
            try:
                content = await asyncio.to_thread(skill.skill_file.read_text, encoding="utf-8")
                content = content.strip()
                if content:
                    messages.append(SystemMessage(content=f'<skill name="{skill.name}">\n{content}\n</skill>'))
                    logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} loaded skill: {skill.name}")
            except Exception:
                # A single unreadable skill is skipped, not fatal.
                logger.debug(f"[trace={self.trace_id}] Failed to read skill {skill.name}", exc_info=True)

        return messages
|
|
|
|
async def _build_initial_state(self, task: str) -> dict[str, Any]:
|
|
"""Build the initial state for agent execution.
|
|
|
|
Args:
|
|
task: The task description.
|
|
|
|
Returns:
|
|
Initial state dictionary.
|
|
"""
|
|
# Load skills as conversation items (Codex pattern)
|
|
skill_messages = await self._load_skill_messages()
|
|
|
|
messages: list = []
|
|
# Skill content injected as developer/system messages before the task
|
|
messages.extend(skill_messages)
|
|
# Then the actual task
|
|
messages.append(HumanMessage(content=task))
|
|
|
|
state: dict[str, Any] = {
|
|
"messages": messages,
|
|
}
|
|
|
|
# Pass through sandbox and thread data from parent
|
|
if self.sandbox_state is not None:
|
|
state["sandbox"] = self.sandbox_state
|
|
if self.thread_data is not None:
|
|
state["thread_data"] = self.thread_data
|
|
|
|
return state
|
|
|
|
async def _aexecute(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
|
|
"""Execute a task asynchronously.
|
|
|
|
Args:
|
|
task: The task description for the subagent.
|
|
result_holder: Optional pre-created result object to update during execution.
|
|
|
|
Returns:
|
|
SubagentResult with the execution result.
|
|
"""
|
|
if result_holder is not None:
|
|
# Use the provided result holder (for async execution with real-time updates)
|
|
result = result_holder
|
|
else:
|
|
# Create a new result for synchronous execution
|
|
task_id = str(uuid.uuid4())[:8]
|
|
result = SubagentResult(
|
|
task_id=task_id,
|
|
trace_id=self.trace_id,
|
|
status=SubagentStatus.RUNNING,
|
|
started_at=datetime.now(),
|
|
)
|
|
ai_messages = result.ai_messages
|
|
if ai_messages is None:
|
|
ai_messages = []
|
|
result.ai_messages = ai_messages
|
|
|
|
try:
|
|
agent = self._create_agent()
|
|
state = await self._build_initial_state(task)
|
|
|
|
# Build config with thread_id for sandbox access and recursion limit
|
|
run_config: RunnableConfig = {
|
|
"recursion_limit": self.config.max_turns,
|
|
}
|
|
context: dict[str, Any] = {}
|
|
if self.thread_id:
|
|
run_config["configurable"] = {"thread_id": self.thread_id}
|
|
context["thread_id"] = self.thread_id
|
|
if self.app_config is not None:
|
|
context["app_config"] = self.app_config
|
|
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution with max_turns={self.config.max_turns}")
|
|
|
|
# Use stream instead of invoke to get real-time updates
|
|
# This allows us to collect AI messages as they are generated
|
|
final_state = None
|
|
|
|
# Pre-check: bail out immediately if already cancelled before streaming starts
|
|
if result.cancel_event.is_set():
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} cancelled before streaming")
|
|
with _background_tasks_lock:
|
|
if result.status == SubagentStatus.RUNNING:
|
|
result.status = SubagentStatus.CANCELLED
|
|
result.error = "Cancelled by user"
|
|
result.completed_at = datetime.now()
|
|
return result
|
|
|
|
async for chunk in agent.astream(state, config=run_config, context=context, stream_mode="values"): # type: ignore[arg-type]
|
|
# Cooperative cancellation: check if parent requested stop.
|
|
# Note: cancellation is only detected at astream iteration boundaries,
|
|
# so long-running tool calls within a single iteration will not be
|
|
# interrupted until the next chunk is yielded.
|
|
if result.cancel_event.is_set():
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} cancelled by parent")
|
|
with _background_tasks_lock:
|
|
if result.status == SubagentStatus.RUNNING:
|
|
result.status = SubagentStatus.CANCELLED
|
|
result.error = "Cancelled by user"
|
|
result.completed_at = datetime.now()
|
|
return result
|
|
|
|
final_state = chunk
|
|
|
|
# Extract AI messages from the current state
|
|
messages = chunk.get("messages", [])
|
|
if messages:
|
|
last_message = messages[-1]
|
|
# Check if this is a new AI message
|
|
if isinstance(last_message, AIMessage):
|
|
# Convert message to dict for serialization
|
|
message_dict = last_message.model_dump()
|
|
# Only add if it's not already in the list (avoid duplicates)
|
|
# Check by comparing message IDs if available, otherwise compare full dict
|
|
message_id = message_dict.get("id")
|
|
is_duplicate = False
|
|
if message_id:
|
|
is_duplicate = any(msg.get("id") == message_id for msg in ai_messages)
|
|
else:
|
|
is_duplicate = message_dict in ai_messages
|
|
|
|
if not is_duplicate:
|
|
ai_messages.append(message_dict)
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} captured AI message #{len(ai_messages)}")
|
|
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} completed async execution")
|
|
|
|
if final_state is None:
|
|
logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no final state")
|
|
result.result = "No response generated"
|
|
else:
|
|
# Extract the final message - find the last AIMessage
|
|
messages = final_state.get("messages", [])
|
|
logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} final messages count: {len(messages)}")
|
|
|
|
# Find the last AIMessage in the conversation
|
|
last_ai_message = None
|
|
for msg in reversed(messages):
|
|
if isinstance(msg, AIMessage):
|
|
last_ai_message = msg
|
|
break
|
|
|
|
if last_ai_message is not None:
|
|
content = last_ai_message.content
|
|
# Handle both str and list content types for the final result
|
|
if isinstance(content, str):
|
|
result.result = content
|
|
elif isinstance(content, list):
|
|
# Extract text from list of content blocks for final result only.
|
|
# Concatenate raw string chunks directly, but preserve separation
|
|
# between full text blocks for readability.
|
|
text_parts = []
|
|
pending_str_parts = []
|
|
for block in content:
|
|
if isinstance(block, str):
|
|
pending_str_parts.append(block)
|
|
elif isinstance(block, dict):
|
|
if pending_str_parts:
|
|
text_parts.append("".join(pending_str_parts))
|
|
pending_str_parts.clear()
|
|
text_val = block.get("text")
|
|
if isinstance(text_val, str):
|
|
text_parts.append(text_val)
|
|
if pending_str_parts:
|
|
text_parts.append("".join(pending_str_parts))
|
|
result.result = "\n".join(text_parts) if text_parts else "No text content in response"
|
|
else:
|
|
result.result = str(content)
|
|
elif messages:
|
|
# Fallback: use the last message if no AIMessage found
|
|
last_message = messages[-1]
|
|
logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no AIMessage found, using last message: {type(last_message)}")
|
|
raw_content = last_message.content if hasattr(last_message, "content") else str(last_message)
|
|
if isinstance(raw_content, str):
|
|
result.result = raw_content
|
|
elif isinstance(raw_content, list):
|
|
parts = []
|
|
pending_str_parts = []
|
|
for block in raw_content:
|
|
if isinstance(block, str):
|
|
pending_str_parts.append(block)
|
|
elif isinstance(block, dict):
|
|
if pending_str_parts:
|
|
parts.append("".join(pending_str_parts))
|
|
pending_str_parts.clear()
|
|
text_val = block.get("text")
|
|
if isinstance(text_val, str):
|
|
parts.append(text_val)
|
|
if pending_str_parts:
|
|
parts.append("".join(pending_str_parts))
|
|
result.result = "\n".join(parts) if parts else "No text content in response"
|
|
else:
|
|
result.result = str(raw_content)
|
|
else:
|
|
logger.warning(f"[trace={self.trace_id}] Subagent {self.config.name} no messages in final state")
|
|
result.result = "No response generated"
|
|
|
|
result.status = SubagentStatus.COMPLETED
|
|
result.completed_at = datetime.now()
|
|
|
|
except Exception as e:
|
|
logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} async execution failed")
|
|
result.status = SubagentStatus.FAILED
|
|
result.error = str(e)
|
|
result.completed_at = datetime.now()
|
|
|
|
return result
|
|
|
|
    def _execute_in_isolated_loop(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
        """Execute the subagent on the persistent isolated event loop.

        This method is used by the sync ``execute()`` path when the caller is
        already running inside an event loop. Because ``execute()`` is a sync
        API, this path blocks the caller while the actual coroutine runs on the
        long-lived isolated loop. Reusing that loop keeps shared async clients
        from being tied to a short-lived loop that gets closed per execution.

        Raises:
            FuturesTimeoutError: When execution exceeds config.timeout_seconds.
            Exception: Re-raises anything the coroutine (or submission) raised.
        """
        future: Future[SubagentResult] | None = None
        # Capture the caller's ContextVars so tracing/config state follows the task.
        parent_context = copy_context()
        try:
            future = _submit_to_isolated_loop_in_context(
                parent_context,
                lambda: self._aexecute(task, result_holder),
            )
            return future.result(timeout=self.config.timeout_seconds)
        except FuturesTimeoutError:
            # On timeout, request cooperative cancellation (polled by _aexecute
            # between stream iterations); future.cancel() only helps if the
            # coroutine has not started running yet.
            if result_holder is not None:
                result_holder.cancel_event.set()
            if future is not None:
                future.cancel()
            raise
        except Exception:
            # future is None means submission itself failed (loop unavailable);
            # otherwise the coroutine raised while running.
            if future is None:
                logger.debug(
                    f"[trace={self.trace_id}] Failed to submit subagent {self.config.name} to the isolated event loop",
                    exc_info=True,
                )
            else:
                logger.debug(
                    f"[trace={self.trace_id}] Subagent {self.config.name} failed while executing on the isolated event loop",
                    exc_info=True,
                )
            raise
|
|
|
|
    def execute(self, task: str, result_holder: SubagentResult | None = None) -> SubagentResult:
        """Execute a task synchronously (wrapper around async execution).

        This method runs the async execution in a new event loop, allowing
        asynchronous tools (like MCP tools) to be used within the thread pool.

        When called from within an already-running event loop (e.g., when the
        parent agent is async), this method synchronously waits on the
        persistent isolated loop to avoid event loop conflicts with shared
        async primitives like httpx clients.

        Args:
            task: The task description for the subagent.
            result_holder: Optional pre-created result object to update during execution.

        Returns:
            SubagentResult with the execution result. Does not raise: any
            exception is converted into a FAILED result.
        """
        try:
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                # No running loop in this thread.
                loop = None

            if loop is not None and loop.is_running():
                logger.debug(f"[trace={self.trace_id}] Subagent {self.config.name} detected running event loop, using isolated loop")
                return self._execute_in_isolated_loop(task, result_holder)

            # Standard path: no running event loop, use asyncio.run
            return asyncio.run(self._aexecute(task, result_holder))
        except Exception as e:
            logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} execution failed")
            # Create a result with error if we don't have one
            if result_holder is not None:
                result = result_holder
            else:
                result = SubagentResult(
                    task_id=str(uuid.uuid4())[:8],
                    trace_id=self.trace_id,
                    status=SubagentStatus.FAILED,
                )
            result.status = SubagentStatus.FAILED
            result.error = str(e)
            result.completed_at = datetime.now()
            return result
|
|
|
|
    def execute_async(self, task: str, task_id: str | None = None) -> str:
        """Start a task execution in the background.

        Registers a PENDING result in the global task table, then submits the
        actual work to the scheduler pool, which in turn runs the coroutine on
        the persistent isolated loop.

        Args:
            task: The task description for the subagent.
            task_id: Optional task ID to use. If not provided, a random UUID will be generated.

        Returns:
            Task ID that can be used to check status later via
            ``get_background_task_result``.
        """
        # Use provided task_id or generate a new one
        if task_id is None:
            task_id = str(uuid.uuid4())[:8]

        # Create initial pending result
        result = SubagentResult(
            task_id=task_id,
            trace_id=self.trace_id,
            status=SubagentStatus.PENDING,
        )

        logger.info(f"[trace={self.trace_id}] Subagent {self.config.name} starting async execution, task_id={task_id}, timeout={self.config.timeout_seconds}s")

        with _background_tasks_lock:
            _background_tasks[task_id] = result

        # Capture caller ContextVars now, before hopping threads.
        parent_context = copy_context()

        # Submit to scheduler pool
        def run_task():
            # Mark the task RUNNING and grab the shared holder under the lock.
            with _background_tasks_lock:
                _background_tasks[task_id].status = SubagentStatus.RUNNING
                _background_tasks[task_id].started_at = datetime.now()
                result_holder = _background_tasks[task_id]

            try:
                # Submit execution directly to the persistent isolated loop so the
                # background path does not create a temporary loop via execute().
                execution_future = _submit_to_isolated_loop_in_context(
                    parent_context,
                    lambda: self._aexecute(task, result_holder),
                )
                try:
                    # Wait for execution with timeout
                    exec_result = execution_future.result(timeout=self.config.timeout_seconds)
                    # Copy terminal fields back under the lock so readers see a
                    # consistent snapshot.
                    with _background_tasks_lock:
                        _background_tasks[task_id].status = exec_result.status
                        _background_tasks[task_id].result = exec_result.result
                        _background_tasks[task_id].error = exec_result.error
                        _background_tasks[task_id].completed_at = datetime.now()
                        _background_tasks[task_id].ai_messages = exec_result.ai_messages
                except FuturesTimeoutError:
                    logger.error(f"[trace={self.trace_id}] Subagent {self.config.name} execution timed out after {self.config.timeout_seconds}s")
                    with _background_tasks_lock:
                        # Only mark TIMED_OUT if the coroutine has not already
                        # reached a terminal state (e.g. CANCELLED).
                        if _background_tasks[task_id].status == SubagentStatus.RUNNING:
                            _background_tasks[task_id].status = SubagentStatus.TIMED_OUT
                            _background_tasks[task_id].error = f"Execution timed out after {self.config.timeout_seconds} seconds"
                            _background_tasks[task_id].completed_at = datetime.now()
                    # Signal cooperative cancellation and cancel the future
                    result_holder.cancel_event.set()
                    execution_future.cancel()
            except Exception as e:
                logger.exception(f"[trace={self.trace_id}] Subagent {self.config.name} async execution failed")
                with _background_tasks_lock:
                    _background_tasks[task_id].status = SubagentStatus.FAILED
                    _background_tasks[task_id].error = str(e)
                    _background_tasks[task_id].completed_at = datetime.now()

        _scheduler_pool.submit(run_task)
        return task_id
|
|
|
|
|
|
# Cap on concurrently running subagents.
# NOTE(review): not referenced elsewhere in this module — presumably enforced
# by the task tool driving this executor; confirm before removing.
MAX_CONCURRENT_SUBAGENTS = 3
|
|
|
|
|
|
def request_cancel_background_task(task_id: str) -> None:
    """Signal a running background task to stop.

    Sets the task's cancel_event, which ``_aexecute`` polls cooperatively
    between ``agent.astream()`` iterations. Subagent threads cannot be
    force-killed via ``Future.cancel()``, so the stop takes effect at the
    next iteration boundary. Unknown task IDs are ignored.

    Args:
        task_id: The task ID to cancel.
    """
    with _background_tasks_lock:
        entry = _background_tasks.get(task_id)
        if entry is None:
            return
        entry.cancel_event.set()
        logger.info("Requested cancellation for background task %s", task_id)
|
|
|
|
|
|
def get_background_task_result(task_id: str) -> SubagentResult | None:
    """Look up the result record of a background task.

    Args:
        task_id: The task ID returned by execute_async.

    Returns:
        The task's SubagentResult, or None when the ID is unknown.
    """
    with _background_tasks_lock:
        try:
            return _background_tasks[task_id]
        except KeyError:
            return None
|
|
|
|
|
|
def list_background_tasks() -> list[SubagentResult]:
    """Snapshot every tracked background task.

    Returns:
        A new list holding all registered SubagentResult instances.
    """
    # Materialize under the lock so the snapshot is internally consistent.
    with _background_tasks_lock:
        return [*_background_tasks.values()]
|
|
|
|
|
|
def cleanup_background_task(task_id: str) -> None:
    """Remove a completed task from background tasks.

    Should be called by task_tool after it finishes polling and returns the result.
    This prevents memory leaks from accumulated completed tasks.

    Only entries that have reached a terminal status (or carry a completion
    timestamp) are removed, so we never race a background executor that is
    still mutating the entry.

    Args:
        task_id: The task ID to remove.
    """
    terminal_states = frozenset({
        SubagentStatus.COMPLETED,
        SubagentStatus.FAILED,
        SubagentStatus.CANCELLED,
        SubagentStatus.TIMED_OUT,
    })

    with _background_tasks_lock:
        entry = _background_tasks.get(task_id)
        if entry is None:
            # Already gone (or never existed) — nothing to do.
            logger.debug("Requested cleanup for unknown background task %s", task_id)
            return

        if entry.status not in terminal_states and entry.completed_at is None:
            logger.debug(
                "Skipping cleanup for non-terminal background task %s (status=%s)",
                task_id,
                getattr(entry.status, "value", entry.status),
            )
            return

        del _background_tasks[task_id]
        logger.debug("Cleaned up background task: %s", task_id)
|