Compare commits

...

2 Commits

Author SHA1 Message Date
Timothy 2cb54595c9 fix: browser timeout 2026-03-18 12:04:49 -07:00
Timothy 284079d18b fix: add timeout to browser tools 2026-03-18 10:51:53 -07:00
4 changed files with 204 additions and 25 deletions
+169
View File
@@ -3980,6 +3980,68 @@ class EventLoopNode(NodeProtocol):
ratio_before = conversation.usage_ratio()
phase_grad = getattr(ctx, "continuous_mode", False)
# Debug snapshot helper
def _snap(name: str, **extra: Any) -> dict[str, Any]:
    """Capture a labelled snapshot of the conversation's current size stats.

    Closure over ``conversation`` and ``self._config`` from the enclosing
    compaction routine; any ``extra`` keyword args are merged into the
    returned snapshot dict (e.g. ``messages_pruned=...``).
    """
    # Tally message counts per role for the debug report.
    roles: dict[str, int] = {}
    for m in conversation.messages:
        roles[m.role] = roles.get(m.role, 0) + 1
    return {
        "name": name,
        "message_count": conversation.message_count,
        "estimated_tokens": conversation.estimate_tokens(),
        # Rendered as a percentage string for readability in the log.
        "usage_ratio": f"{conversation.usage_ratio():.2%}",
        "max_context_tokens": self._config.max_context_tokens,
        "messages_by_role": roles,
        **extra,
    }
initial = _snap("initial")
# When over budget, attach a full message inventory so the log
# shows exactly what is consuming the context window.
if ratio_before >= 1.0:
inventory: list[dict[str, Any]] = []
for m in conversation.messages:
content_chars = len(m.content)
tc_chars = 0
tool_name = None
if m.tool_calls:
for tc in m.tool_calls:
args = tc.get("function", {}).get("arguments", "")
tc_chars += len(args) if isinstance(args, str) else len(json.dumps(args))
names = [tc.get("function", {}).get("name", "?") for tc in m.tool_calls]
tool_name = ", ".join(names)
elif m.role == "tool" and m.tool_use_id:
# Try to find the tool name from the preceding assistant message
for prev in conversation.messages:
if prev.tool_calls:
for tc in prev.tool_calls:
if tc.get("id") == m.tool_use_id:
tool_name = tc.get("function", {}).get("name", "?")
break
if tool_name:
break
entry: dict[str, Any] = {
"seq": m.seq,
"role": m.role,
"content_chars": content_chars,
}
if tc_chars:
entry["tool_call_args_chars"] = tc_chars
if tool_name:
entry["tool"] = tool_name
if m.is_error:
entry["is_error"] = True
if m.phase_id:
entry["phase"] = m.phase_id
# Content preview for the biggest messages
if content_chars > 2000:
entry["preview"] = m.content[:200] + "…"
inventory.append(entry)
initial["message_inventory"] = inventory
debug_steps: list[dict[str, Any]] = [initial]
# --- Step 1: Prune old tool results (free, no LLM) ---
protect = max(2000, self._config.max_context_tokens // 12)
pruned = await conversation.prune_old_tool_results(
@@ -3993,8 +4055,10 @@ class EventLoopNode(NodeProtocol):
ratio_before * 100,
conversation.usage_ratio() * 100,
)
debug_steps.append(_snap("after_prune", messages_pruned=pruned))
if not conversation.needs_compaction():
await self._log_compaction(ctx, conversation, ratio_before)
self._write_compaction_debug_log(ctx, debug_steps)
return
# --- Step 2: Standard structure-preserving compaction (free, no LLM) ---
@@ -4006,8 +4070,14 @@ class EventLoopNode(NodeProtocol):
keep_recent=4,
phase_graduated=phase_grad,
)
debug_steps.append(_snap(
"after_structural",
spillover_dir=spill_dir,
keep_recent=4,
))
if not conversation.needs_compaction():
await self._log_compaction(ctx, conversation, ratio_before)
self._write_compaction_debug_log(ctx, debug_steps)
return
# --- Step 3: LLM summary compaction ---
@@ -4030,11 +4100,20 @@ class EventLoopNode(NodeProtocol):
keep_recent=2,
phase_graduated=phase_grad,
)
debug_steps.append(_snap(
"after_llm_compact",
summary_chars=len(summary),
))
except Exception as e:
logger.warning("LLM compaction failed: %s", e)
debug_steps.append(_snap(
"llm_compact_failed",
error=str(e),
))
if not conversation.needs_compaction():
await self._log_compaction(ctx, conversation, ratio_before)
self._write_compaction_debug_log(ctx, debug_steps)
return
# --- Step 4: Emergency deterministic summary (LLM failed/unavailable) ---
@@ -4048,7 +4127,12 @@ class EventLoopNode(NodeProtocol):
keep_recent=1,
phase_graduated=phase_grad,
)
debug_steps.append(_snap(
"after_emergency",
summary_chars=len(summary),
))
await self._log_compaction(ctx, conversation, ratio_before)
self._write_compaction_debug_log(ctx, debug_steps)
# --- LLM compaction with binary-search splitting ----------------------
@@ -4262,6 +4346,91 @@ class EventLoopNode(NodeProtocol):
)
)
@staticmethod
def _write_compaction_debug_log(
    ctx: NodeContext,
    steps: list[dict[str, Any]],
) -> None:
    """Write detailed compaction analysis to ~/.hive/compaction_log/.

    Only runs when HIVE_COMPACTION_DEBUG is set in the environment.
    Each compaction produces a timestamped markdown file.

    ``steps`` is the list of snapshot dicts produced by ``_snap`` during
    the compaction pipeline; special keys (``messages_by_role``,
    ``message_inventory``, ``discarded_messages``) get structured
    rendering, everything else is emitted as a scalar bullet.
    """
    # Local import keeps this debug-only dependency out of the hot path.
    import os
    # Opt-in gate: without the env var this is a no-op.
    if not os.environ.get("HIVE_COMPACTION_DEBUG"):
        return
    log_dir = Path.home() / ".hive" / "compaction_log"
    log_dir.mkdir(parents=True, exist_ok=True)
    # Microsecond-resolution UTC timestamp keeps concurrent writes from
    # colliding on the same filename.
    ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%S_%f")
    # Node ids may contain "/" which is illegal in a filename component.
    node_label = ctx.node_id.replace("/", "_")
    log_path = log_dir / f"{ts}_{node_label}.md"
    lines: list[str] = []
    lines.append(f"# Compaction Debug — {ctx.node_id}")
    lines.append(f"**Time:** {datetime.now(UTC).isoformat()}")
    lines.append(f"**Node:** {ctx.node_spec.name} (`{ctx.node_id}`)")
    if ctx.stream_id:
        lines.append(f"**Stream:** {ctx.stream_id}")
    lines.append("")
    for step in steps:
        name = step.get("name", "unknown")
        lines.append(f"## Step: {name}")
        for key, val in step.items():
            if key == "name":
                continue  # already rendered in the section heading
            if key == "messages_by_role":
                # Nested bullet list of role -> count.
                lines.append(f"- **{key}:**")
                for role, count in val.items():
                    lines.append(f" - {role}: {count}")
            elif key == "message_inventory":
                # Full per-message size table, ranked largest-first, so
                # the log shows exactly what consumed the context window.
                total_chars = sum(e.get("content_chars", 0) + e.get("tool_call_args_chars", 0) for e in val)
                lines.append(f"### Message Inventory ({len(val)} messages, {total_chars:,} total chars)")
                lines.append("")
                # Sort descending by size for the table
                ranked = sorted(val, key=lambda e: e.get("content_chars", 0) + e.get("tool_call_args_chars", 0), reverse=True)
                lines.append("| # | seq | role | tool | chars | % of total | flags |")
                lines.append("|---|-----|------|------|------:|------------|-------|")
                for i, entry in enumerate(ranked, 1):
                    chars = entry.get("content_chars", 0) + entry.get("tool_call_args_chars", 0)
                    # Guard against division by zero when every entry is empty.
                    pct = (chars / total_chars * 100) if total_chars else 0
                    tool = entry.get("tool", "")
                    flags = []
                    if entry.get("is_error"):
                        flags.append("error")
                    if entry.get("phase"):
                        flags.append(f"phase={entry['phase']}")
                    lines.append(
                        f"| {i} | {entry['seq']} | {entry['role']} | {tool} "
                        f"| {chars:,} | {pct:.1f}% | {', '.join(flags)} |"
                    )
                # Previews for large messages
                large = [e for e in ranked if e.get("preview")]
                if large:
                    lines.append("")
                    lines.append("#### Large message previews")
                    for entry in large:
                        lines.append(f"\n**seq={entry['seq']}** ({entry['role']}, {entry.get('tool', '')}):")
                        lines.append(f"```\n{entry['preview']}\n```")
            elif key == "discarded_messages":
                lines.append(f"- **{key}:** ({len(val)} messages)")
                for msg_info in val[:50]:  # cap at 50
                    lines.append(f" - seq={msg_info['seq']} role={msg_info['role']} chars={msg_info['chars']}")
                if len(val) > 50:
                    lines.append(f" - ... and {len(val) - 50} more")
            else:
                # Any other snapshot value renders as a simple bullet.
                lines.append(f"- **{key}:** {val}")
        lines.append("")
    # Best-effort write: a debug log failure must never break compaction,
    # so OSError is downgraded to a debug message.
    try:
        log_path.write_text("\n".join(lines), encoding="utf-8")
        logger.debug("Compaction debug log written to %s", log_path)
    except OSError:
        logger.debug("Failed to write compaction debug log to %s", log_path)
def _build_emergency_summary(
self,
ctx: NodeContext,
+10 -10
View File
@@ -210,6 +210,16 @@ def configure_logging(
# printed on every single completion call). Warnings and errors still show.
logging.getLogger("LiteLLM").setLevel(logging.WARNING)
# Suppress the "Provider List: ..." banner litellm prints to stdout via
# print() on every completion call. This is independent of log format.
try:
import litellm as _litellm
if hasattr(_litellm, "suppress_debug_info"):
_litellm.suppress_debug_info = True # type: ignore[attr-defined]
except (ImportError, AttributeError):
pass
# When in JSON mode, configure known third-party loggers to use JSON formatter
# This ensures libraries like LiteLLM, httpcore also output clean JSON
if format == "json":
@@ -232,16 +242,6 @@ def _disable_third_party_colors() -> None:
os.environ["NO_COLOR"] = "1"
os.environ["FORCE_COLOR"] = "0"
# Disable LiteLLM debug/verbose output colors if available
try:
import litellm
# LiteLLM respects NO_COLOR, but we can also suppress debug info
if hasattr(litellm, "suppress_debug_info"):
litellm.suppress_debug_info = True # type: ignore[attr-defined]
except (ImportError, AttributeError):
pass
def set_trace_context(**kwargs: Any) -> None:
"""
-8
View File
@@ -60,7 +60,6 @@
"integrity": "sha512-CGOfOJqWjg2qW/Mb6zNsDm+u5vFQ8DxXfbM09z69p5Z6+mE1ikP2jUXw+j42Pf1XTYED2Rni5f95npYeuwMDQA==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@babel/code-frame": "^7.29.0",
"@babel/generator": "^7.29.0",
@@ -1557,7 +1556,6 @@
"integrity": "sha512-4K3bqJpXpqfg2XKGK9bpDTc6xO/xoUP/RBWS7AtRMug6zZFaRekiLzjVtAoZMquxoAbzBvy5nxQ7veS5eYzf8A==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"undici-types": "~7.18.0"
}
@@ -1573,7 +1571,6 @@
"resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.28.tgz",
"integrity": "sha512-z9VXpC7MWrhfWipitjNdgCauoMLRdIILQsAEV+ZesIzBq/oUlxk0m3ApZuMFCXdnS4U7KrI+l3WRUEGQ8K1QKw==",
"license": "MIT",
"peer": true,
"dependencies": {
"@types/prop-types": "*",
"csstype": "^3.2.2"
@@ -1786,7 +1783,6 @@
}
],
"license": "MIT",
"peer": true,
"dependencies": {
"baseline-browser-mapping": "^2.9.0",
"caniuse-lite": "^1.0.30001759",
@@ -3564,7 +3560,6 @@
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=12"
},
@@ -3616,7 +3611,6 @@
"resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz",
"integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==",
"license": "MIT",
"peer": true,
"dependencies": {
"loose-envify": "^1.1.0"
},
@@ -3629,7 +3623,6 @@
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz",
"integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==",
"license": "MIT",
"peer": true,
"dependencies": {
"loose-envify": "^1.1.0",
"scheduler": "^0.23.2"
@@ -4190,7 +4183,6 @@
"integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"esbuild": "^0.25.0",
"fdir": "^6.4.4",
+25 -7
View File
@@ -409,6 +409,8 @@ class BrowserSession:
We're already inside ``self._lock`` so we can't call ``stop()``.
This mirrors the teardown logic without re-acquiring the lock.
"""
_CLOSE_TIMEOUT = 10.0 # seconds
if self.cdp_port:
from .port_manager import release_port
@@ -417,21 +419,21 @@ class BrowserSession:
if self.context:
try:
await self.context.close()
await asyncio.wait_for(self.context.close(), timeout=_CLOSE_TIMEOUT)
except Exception:
pass
self.context = None
if self.browser:
try:
await self.browser.close()
await asyncio.wait_for(self.browser.close(), timeout=_CLOSE_TIMEOUT)
except Exception:
pass
self.browser = None
if self._playwright:
try:
await self._playwright.stop()
await asyncio.wait_for(self._playwright.stop(), timeout=_CLOSE_TIMEOUT)
except Exception:
pass
self._playwright = None
@@ -588,6 +590,10 @@ class BrowserSession:
async def stop(self) -> dict:
"""Stop the browser and clean up resources."""
# Timeout for each Playwright teardown call — prevents hanging when
# the browser process is crashed or unresponsive.
_CLOSE_TIMEOUT = 10.0 # seconds
async with self._lock:
# Release CDP port if allocated
if self.cdp_port:
@@ -598,23 +604,35 @@ class BrowserSession:
# Close context (works for both persistent and ephemeral)
if self.context:
await self.context.close()
try:
await asyncio.wait_for(self.context.close(), timeout=_CLOSE_TIMEOUT)
except Exception as exc:
logger.warning("context.close() failed for profile %r: %s", self.profile, exc)
self.context = None
# Agent sessions share a browser — don't close it (other agents depend on it).
# Only standard sessions own their browser and playwright instances.
if self.session_type != "agent":
if self.browser:
await self.browser.close()
try:
await asyncio.wait_for(self.browser.close(), timeout=_CLOSE_TIMEOUT)
except Exception as exc:
logger.warning("browser.close() failed for profile %r: %s", self.profile, exc)
self.browser = None
if self._playwright:
await self._playwright.stop()
try:
await asyncio.wait_for(self._playwright.stop(), timeout=_CLOSE_TIMEOUT)
except Exception as exc:
logger.warning("playwright.stop() failed for profile %r: %s", self.profile, exc)
self._playwright = None
# Kill the Chrome subprocess
if self._chrome_process:
await self._chrome_process.kill()
try:
await self._chrome_process.kill()
except Exception as exc:
logger.warning("chrome_process.kill() failed for profile %r: %s", self.profile, exc)
self._chrome_process = None
else:
self.browser = None # Drop reference to shared browser