Files
deer-flow/backend/packages/harness/deerflow/actor/system.py
T
greatmengqi 228a2a66e3 fix(actor): harden lifecycle, supervision, Redis mailbox, and add comprehensive tests
- Fix spawn() zombie cell: clean up registry on start() failure
- Fix shutdown(): cancel + await tasks that exceed graceful timeout
- Fix _shutdown(): await mailbox.close() to release backend resources
- Fix escalate directive: stop failing child before propagating to grandparent
- Fix RedisMailbox.put(): wrap Redis errors in try/except, return False on failure
- Fix retry.py: replace assert with proper raise for last_exc
- Add put_batch() to Mailbox abstraction for single-roundtrip bulk enqueue
- Add RedisMailbox.put_batch() with atomic Lua script for bounded queues
- Add MailboxFullError exception type for semantic backpressure handling
- Add redis>=7.4.0 dependency with public PyPI sources in uv.lock

Tests added (31 total, up from 27):
- test_middleware_on_restart_hook: verifies middleware.on_restart() on supervision restart
- test_ask_propagates_actor_exception: ask() re-raises original exception type
- test_ask_propagates_exception_while_supervised: exception propagates; root actor survives
- test_ask_timeout_late_reply_no_exception: late reply after timeout is silent no-op
- test_actor_backpressure.py: MailboxFullError + dead letter on full mailbox
- test_actor_retry.py: ask_with_retry with exponential backoff
- test_mailbox_redis.py: RedisMailbox put/get/batch/close
- bench_actor_redis.py: RedisMailbox throughput benchmarks
2026-03-31 10:09:05 +08:00

417 lines
17 KiB
Python

"""ActorSystem — top-level actor container and lifecycle manager."""
from __future__ import annotations
import asyncio
import logging
from collections import deque
from dataclasses import dataclass
from typing import Any
from .actor import Actor, ActorContext
from .mailbox import Empty, Mailbox, MemoryMailbox
from .middleware import ActorMailboxContext, Middleware, NextFn, build_middleware_chain
from .ref import ActorRef, ActorStoppedError, MailboxFullError, ReplyChannel, _Envelope, _ReplyMessage, _ReplyRegistry, _Stop
from .supervision import Directive, SupervisorStrategy
logger = logging.getLogger(__name__)
# Timeout for middleware lifecycle hooks (on_started/on_stopped)
_MIDDLEWARE_HOOK_TIMEOUT = 10.0
# Maximum dead letters kept in memory
_MAX_DEAD_LETTERS = 10000
# Maximum consecutive failures before a root actor poison-quarantines a message
_MAX_CONSECUTIVE_FAILURES = 10
@dataclass
class DeadLetter:
    """A message that could not be delivered."""

    # The actor the message was addressed to.
    recipient: ActorRef
    # The original (undelivered) payload object.
    message: Any
    # The sending actor, if the send carried one; None for anonymous sends.
    sender: ActorRef | None
class ActorSystem:
    """Top-level actor container.

    Manages root actors and provides the dead letter sink.
    """

    def __init__(
        self,
        name: str = "system",
        *,
        max_dead_letters: int = _MAX_DEAD_LETTERS,
        executor_workers: int | None = 4,
        reply_channel: ReplyChannel | None = None,
    ) -> None:
        """Create a system.

        Args:
            name: Human-readable system name (used in actor paths and thread names).
            max_dead_letters: Bound on the in-memory dead letter ring buffer.
            executor_workers: Size of the shared blocking-I/O thread pool;
                None or 0 disables the pool.
            reply_channel: Custom reply transport; defaults to a new ReplyChannel.
        """
        import uuid as _uuid

        self.name = name
        # Unique per-process identity so replies can be routed back to this system.
        self.system_id = f"{name}-{_uuid.uuid4().hex[:8]}"
        self._root_cells: dict[str, _ActorCell] = {}
        self._dead_letters: deque[DeadLetter] = deque(maxlen=max_dead_letters)
        self._on_dead_letter: list[Any] = []
        self._shutting_down = False
        self._replies = _ReplyRegistry()
        self._reply_channel = reply_channel or ReplyChannel()
        # Shared thread pool for actors to run blocking I/O
        from concurrent.futures import ThreadPoolExecutor

        self._executor = (
            ThreadPoolExecutor(max_workers=executor_workers, thread_name_prefix=f"actor-{name}")
            if executor_workers
            else None
        )

    async def spawn(
        self,
        actor_cls: type[Actor],
        name: str,
        *,
        mailbox_size: int = 256,
        mailbox: Mailbox | None = None,
        middlewares: list[Middleware] | None = None,
    ) -> ActorRef:
        """Spawn a root-level actor.

        Args:
            mailbox: Custom mailbox instance. If None, uses MemoryMailbox(mailbox_size).

        Raises:
            ValueError: If a root actor with this name already exists.
        """
        if name in self._root_cells:
            raise ValueError(f"Root actor '{name}' already exists")
        cell = _ActorCell(
            actor_cls=actor_cls,
            name=name,
            parent=None,
            system=self,
            mailbox=mailbox or MemoryMailbox(mailbox_size),
            middlewares=middlewares or [],
        )
        self._root_cells[name] = cell
        try:
            await cell.start()
        except Exception:
            # A failed start must not leave a zombie cell registered.
            del self._root_cells[name]
            raise
        return cell.ref

    async def shutdown(self, *, timeout: float = 10.0) -> None:
        """Gracefully stop all actors.

        Waits up to `timeout` seconds for root actor tasks, then cancels any
        stragglers so no zombie tasks outlive the system.
        """
        self._shutting_down = True
        tasks = []
        for cell in list(self._root_cells.values()):
            cell.request_stop()
            if cell.task is not None:
                tasks.append(cell.task)
        if tasks:
            _, pending = await asyncio.wait(tasks, timeout=timeout)
            # Cancel tasks that didn't finish within the timeout to prevent zombie tasks
            for t in pending:
                t.cancel()
            if pending:
                # Give cancelled tasks a short grace period to run their cleanup.
                await asyncio.wait(pending, timeout=2.0)
        self._root_cells.clear()
        # Fail every outstanding ask() so callers don't hang forever.
        self._replies.reject_all(ActorStoppedError("ActorSystem shutting down"))
        await self._reply_channel.stop_listener()
        if self._executor is not None:
            self._executor.shutdown(wait=False)
        logger.info("ActorSystem '%s' shut down (%d dead letters)", self.name, len(self._dead_letters))

    def _dead_letter(self, recipient: ActorRef, message: Any, sender: ActorRef | None) -> None:
        """Record an undeliverable message and notify listeners (best-effort)."""
        dl = DeadLetter(recipient=recipient, message=message, sender=sender)
        self._dead_letters.append(dl)
        for cb in self._on_dead_letter:
            try:
                cb(dl)
            except Exception:
                # Listener bugs must not break delivery accounting, but they
                # should not vanish silently either.
                logger.exception("Dead letter listener failed for %s", recipient.path)
        # "->" separates the message type from the recipient path in the log line.
        logger.debug("Dead letter: %s -> %s", type(message).__name__, recipient.path)

    def on_dead_letter(self, callback: Any) -> None:
        """Register a dead letter listener.

        The callback is invoked synchronously with each DeadLetter; exceptions
        it raises are logged and otherwise ignored.
        """
        self._on_dead_letter.append(callback)

    @property
    def dead_letters(self) -> list[DeadLetter]:
        """Snapshot copy of recorded dead letters, oldest first."""
        return list(self._dead_letters)
# ---------------------------------------------------------------------------
# _ActorCell — internal runtime wrapper
# ---------------------------------------------------------------------------
class _ActorCell:
    """Runtime container for a single actor instance.

    Manages the mailbox, processing loop, children, and supervision.
    Not part of the public API — user code interacts with actors via ActorRef.
    """
def __init__(
self,
actor_cls: type[Actor],
name: str,
parent: _ActorCell | None,
system: ActorSystem,
mailbox: Mailbox,
middlewares: list[Middleware] | None = None,
) -> None:
self.actor_cls = actor_cls
self.name = name
self.parent = parent
self.system = system
self.children: dict[str, _ActorCell] = {}
self.mailbox = mailbox
self.ref = ActorRef(self)
self.actor: Actor | None = None
self.task: asyncio.Task[None] | None = None
self.stopped = False
self._supervisor_strategy: SupervisorStrategy | None = None
self._middlewares = middlewares or []
self._receive_chain: NextFn | None = None
# Cache path (immutable after init — parent never changes)
parts: list[str] = []
cell: _ActorCell | None = self
while cell is not None:
parts.append(cell.name)
cell = cell.parent
parts.append(system.name)
self.path = "/" + "/".join(reversed(parts))
async def start(self) -> None:
self.actor = self.actor_cls()
self.actor.context = ActorContext(self)
async def _inner_handler(_ctx: ActorMailboxContext, message: Any) -> Any:
return await self.actor.on_receive(message) # type: ignore[union-attr]
if self._middlewares:
self._receive_chain = build_middleware_chain(self._middlewares, _inner_handler)
else:
self._receive_chain = _inner_handler
# Notify middleware of start (with timeout to prevent blocking)
for mw in self._middlewares:
try:
await asyncio.wait_for(mw.on_started(self.ref), timeout=_MIDDLEWARE_HOOK_TIMEOUT)
except asyncio.TimeoutError:
logger.warning("Middleware %s.on_started timed out for %s", type(mw).__name__, self.path)
await self.actor.on_started()
self.task = asyncio.create_task(self._run(), name=f"actor:{self.path}")
async def enqueue(self, msg: _Envelope | _Stop) -> None:
# Try non-blocking first (fast path for MemoryMailbox)
if self.mailbox.put_nowait(msg):
return
# Fallback to async put (required for Redis and other async backends)
if not await self.mailbox.put(msg):
if isinstance(msg, _Envelope) and msg.correlation_id is not None:
self.system._replies.reject(msg.correlation_id, MailboxFullError(f"Mailbox full: {self.path}"))
elif isinstance(msg, _Envelope):
self.system._dead_letter(self.ref, msg.payload, msg.sender)
def request_stop(self) -> None:
"""Request graceful shutdown.
Tries put_nowait first. If that fails (full or unsupported backend),
cancels the task directly so _run exits via CancelledError → finally → _shutdown.
"""
if not self.stopped:
if not self.mailbox.put_nowait(_Stop()):
# Redis/async backends can't put_nowait — cancel the task
if self.task is not None and not self.task.done():
self.task.cancel()
else:
self.stopped = True
async def spawn_child(
self,
actor_cls: type[Actor],
name: str,
*,
mailbox_size: int = 256,
mailbox: Mailbox | None = None,
middlewares: list[Middleware] | None = None,
) -> ActorRef:
if name in self.children:
raise ValueError(f"Child '{name}' already exists under {self.path}")
child = _ActorCell(
actor_cls=actor_cls,
name=name,
parent=self,
system=self.system,
mailbox=mailbox or MemoryMailbox(mailbox_size),
middlewares=middlewares or [],
)
self.children[name] = child
try:
await child.start()
except Exception:
del self.children[name]
raise
return child.ref
    # -- Processing loop -------------------------------------------------------
    async def _run(self) -> None:
        """Main message loop: dequeue, dispatch through middleware, reply, supervise.

        Runs until a _Stop sentinel arrives, the task is cancelled, or (for a
        root actor) the consecutive-failure limit is hit. Always funnels into
        _shutdown() via the finally block.
        """
        consecutive_failures = 0
        try:
            while not self.stopped:
                try:
                    msg = await self.mailbox.get()
                except asyncio.CancelledError:
                    # Cancelled while idle on get() — exit and clean up in finally.
                    break
                if isinstance(msg, _Stop):
                    break
                try:
                    if not isinstance(msg, _Envelope):
                        # Unknown message type: silently skip.
                        continue
                    msg_type = "ask" if msg.correlation_id else "tell"
                    ctx = ActorMailboxContext(self.ref, msg.sender, msg_type)
                    result = await self._receive_chain(ctx, msg.payload)  # type: ignore[misc]
                    if msg.correlation_id is not None:
                        # ask(): route the result back to the waiting caller,
                        # defaulting to this system when no reply_to was given.
                        reply = _ReplyMessage(msg.correlation_id, result=result)
                        await self.system._reply_channel.send_reply(msg.reply_to or self.system.system_id, reply, self.system._replies)
                    consecutive_failures = 0
                except Exception as exc:
                    # For asks, deliver the failure so the caller can re-raise it.
                    if isinstance(msg, _Envelope) and msg.correlation_id is not None:
                        reply = _ReplyMessage(msg.correlation_id, error=str(exc), exception=exc)
                        await self.system._reply_channel.send_reply(msg.reply_to or self.system.system_id, reply, self.system._replies)
                    if self.parent is not None:
                        # Supervised actor: parent decides resume/stop/restart/escalate.
                        await self.parent._handle_child_failure(self, exc)
                    else:
                        # Root actor has no supervisor: count failures and stop
                        # after too many in a row to avoid an infinite crash loop.
                        consecutive_failures += 1
                        logger.error("Uncaught error in root actor %s (%d/%d): %s", self.path, consecutive_failures, _MAX_CONSECUTIVE_FAILURES, exc)
                        if consecutive_failures >= _MAX_CONSECUTIVE_FAILURES:
                            logger.error("Root actor %s hit consecutive failure limit — stopping", self.path)
                            break
        except asyncio.CancelledError:
            pass  # Fall through to _shutdown
        finally:
            await self._shutdown()
    async def _shutdown(self) -> None:
        """Tear down this cell: stop children, drain the mailbox, fire stop hooks.

        Invoked from _run's finally block, so it runs on graceful stop,
        cancellation, and failure-limit exits alike.
        """
        self.stopped = True
        # Parallel child shutdown prevents cascading timeouts.
        child_tasks = []
        for child in list(self.children.values()):
            child.request_stop()
            if child.task is not None:
                child_tasks.append(child.task)
        if child_tasks:
            _, pending = await asyncio.wait(child_tasks, timeout=10.0)
            for t in pending:
                t.cancel()
                # Mark leaked children as stopped
                for child in self.children.values():
                    if child.task is t:
                        child.stopped = True
        # Drain mailbox → dead letters (use try/except to handle all backends)
        while True:
            try:
                msg = self.mailbox.get_nowait()
            except Empty:
                break
            if isinstance(msg, _Envelope):
                if msg.correlation_id is not None:
                    # Pending ask: fail the caller rather than leaving it hanging.
                    self.system._replies.reject(msg.correlation_id, ActorStoppedError(f"Actor {self.path} stopped"))
                else:
                    self.system._dead_letter(self.ref, msg.payload, msg.sender)
        # Lifecycle hook
        for mw in self._middlewares:
            try:
                await asyncio.wait_for(mw.on_stopped(self.ref), timeout=_MIDDLEWARE_HOOK_TIMEOUT)
            except asyncio.TimeoutError:
                logger.warning("Middleware %s.on_stopped timed out for %s", type(mw).__name__, self.path)
            except Exception:
                logger.exception("Error in middleware on_stopped for %s", self.path)
        if self.actor is not None:
            try:
                await self.actor.on_stopped()
            except Exception:
                # on_stopped failures are logged, never propagated — shutdown must finish.
                logger.exception("Error in on_stopped for %s", self.path)
        # Remove from parent
        if self.parent is not None:
            self.parent.children.pop(self.name, None)
        # Close mailbox to release backend resources (e.g. Redis connections)
        try:
            await self.mailbox.close()
        except Exception:
            logger.exception("Error closing mailbox for %s", self.path)
# -- Supervision -----------------------------------------------------------
def _get_supervisor_strategy(self) -> SupervisorStrategy:
if self._supervisor_strategy is None:
self._supervisor_strategy = self.actor.supervisor_strategy() # type: ignore[union-attr]
return self._supervisor_strategy
    async def _handle_child_failure(self, child: _ActorCell, error: Exception) -> None:
        """Apply this actor's supervision strategy to a failed child.

        Called from the child's _run loop when its receive handler raised.
        The strategy maps the exception to a Directive and decides which
        children are affected (the failing child alone, or siblings too).
        """
        strategy = self._get_supervisor_strategy()
        directive = strategy.decide(error)
        # May include siblings, depending on the strategy's scope.
        affected = strategy.apply_to_children(child.name, list(self.children.keys()))
        if directive == Directive.resume:
            # Keep the child's state and mailbox; just log and move on.
            logger.info("Supervisor %s: resume %s after %s", self.path, child.path, type(error).__name__)
            return
        if directive == Directive.stop:
            for name in affected:
                c = self.children.get(name)
                if c is not None:
                    c.request_stop()
            logger.info("Supervisor %s: stop %s after %s", self.path, [self.children[n].path for n in affected if n in self.children], type(error).__name__)
            return
        if directive == Directive.escalate:
            # Stop the failing child, then propagate failure up the supervision chain.
            # We cannot use `raise error` here — that would crash the child's _run
            # loop instead of notifying the grandparent's supervisor.
            child.request_stop()
            if self.parent is not None:
                logger.info("Supervisor %s: escalate %s to grandparent %s", self.path, type(error).__name__, self.parent.path)
                # Recurse upward: from the grandparent's view, *this* cell failed.
                await self.parent._handle_child_failure(self, error)
            else:
                logger.error("Uncaught escalation at root actor %s: %s", self.path, error)
            return
        if directive == Directive.restart:
            for name in affected:
                c = self.children.get(name)
                if c is None:
                    continue
                # record_restart enforces the restart-rate window; beyond the
                # limit the child is stopped instead of restarted.
                if not strategy.record_restart(name):
                    logger.warning("Supervisor %s: child %s exceeded restart limit — stopping", self.path, c.path)
                    c.request_stop()
                    continue
                await self._restart_child(c, error)
async def _restart_child(self, child: _ActorCell, error: Exception) -> None:
logger.info("Supervisor %s: restarting %s after %s", self.path, child.path, type(error).__name__)
# Stop the old actor (but keep the cell and mailbox)
old_actor = child.actor
if old_actor is not None:
try:
await old_actor.on_stopped()
except Exception:
logger.exception("Error in on_stopped during restart of %s", child.path)
# Notify middleware of restart (reset per-instance state)
for mw in child._middlewares:
try:
await asyncio.wait_for(mw.on_restart(child.ref, error), timeout=_MIDDLEWARE_HOOK_TIMEOUT)
except asyncio.TimeoutError:
logger.warning("Middleware %s.on_restart timed out for %s", type(mw).__name__, child.path)
except Exception:
logger.exception("Error in middleware on_restart for %s", child.path)
# Create fresh instance
new_actor = child.actor_cls()
new_actor.context = ActorContext(child)
child.actor = new_actor
try:
await new_actor.on_restart(error)
await new_actor.on_started()
except Exception:
logger.exception("Error during restart initialization of %s", child.path)
child.request_stop()