hermes-agent/tools/environments/modal_utils.py
Teknium a418ddbd8b
fix: add activity heartbeats to prevent false gateway inactivity timeouts (#10501)
Multiple gaps in activity tracking could cause the gateway's inactivity
timeout to fire while the agent is actively working:

1. Streaming wait loop had no periodic heartbeat — the outer thread only
   touched activity when the stale-stream detector fired (180-300s), and
   for local providers (Ollama) the stale timeout was infinity, meaning
   zero heartbeats. Now touches activity every 30s.

2. Concurrent tool execution never set the activity callback on worker
   threads (threading.local invisible across threads) and never set
   _current_tool. Workers now set the callback, and the concurrent wait
   uses a polling loop with 30s heartbeats.

3. Modal backend's execute() override had its own polling loop without
   any activity callback. Now matches _wait_for_process cadence (10s).
2026-04-15 13:29:05 -07:00

206 lines
6.8 KiB
Python

"""Shared Hermes-side execution flow for Modal transports.
This module deliberately stops at the Hermes boundary:
- command preparation
- cwd/timeout normalization
- stdin/sudo shell wrapping
- common result shape
- interrupt/cancel polling
Direct Modal and managed Modal keep separate transport logic, persistence, and
trust-boundary decisions in their own modules.
"""
from __future__ import annotations
import shlex
import time
import uuid
from abc import abstractmethod
from dataclasses import dataclass
from typing import Any
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
@dataclass(frozen=True)
class PreparedModalExec:
    """Normalized command data passed to a transport-specific exec runner.

    Instances are immutable (``frozen=True``). They are produced by
    ``BaseModalExecutionEnvironment._prepare_modal_exec`` and handed to the
    transport's ``_start_modal_exec`` implementation.
    """

    # Shell command to run, after any heredoc / sudo-pipe wrapping.
    command: str
    # Working directory the command should run in.
    cwd: str
    # Command timeout in seconds.
    timeout: int
    # Raw stdin payload for transports that accept it directly; ``None`` when
    # there is no stdin or it was already embedded into ``command``.
    stdin_data: str | None = None
@dataclass(frozen=True)
class ModalExecStart:
    """Transport response after starting an exec.

    A transport fills in at most one of the two fields that matter to the
    caller: ``immediate_result`` when the command already finished (or failed)
    during start-up, otherwise ``handle`` for subsequent polling.
    """

    # Opaque, transport-defined token later passed to ``_poll_modal_exec`` and
    # ``_cancel_modal_exec``.
    handle: Any | None = None
    # Final result dict to return as-is without entering the polling loop.
    immediate_result: dict | None = None
def wrap_modal_stdin_heredoc(command: str, stdin_data: str) -> str:
    """Append stdin as a shell heredoc for transports without stdin piping.

    Args:
        command: Shell command that should receive ``stdin_data`` on stdin.
        stdin_data: Text to embed as the heredoc body.

    Returns:
        ``command`` followed by ``<< '<marker>'``, the payload on the next
        lines, and the closing marker on its own line. The marker is written
        in single quotes so the payload is passed through as literal text.
    """
    marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
    # Re-roll in the unlikely event that the payload already contains the
    # marker, which would otherwise terminate the heredoc early.
    while marker in stdin_data:
        marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
    return f"{command} << '{marker}'\n{stdin_data}\n{marker}"
def wrap_modal_sudo_pipe(command: str, sudo_stdin: str) -> str:
    """Feed sudo via a shell pipe for transports without direct stdin piping.

    Args:
        command: Shell command that reads the sudo secret from stdin.
        sudo_stdin: Text to pipe into ``command``; a trailing line terminator
            is dropped because ``printf '%s\\n'`` adds one back.

    Returns:
        A ``printf ... | command`` pipeline with the secret shell-quoted.
    """
    # Strip only the trailing newline/carriage return. A bare ``rstrip()``
    # would also eat trailing spaces or tabs that are part of the secret.
    secret = sudo_stdin.rstrip("\r\n")
    return f"printf '%s\\n' {shlex.quote(secret)} | {command}"
class BaseModalExecutionEnvironment(BaseEnvironment):
    """Execution flow for the *managed* Modal transport (gateway-owned sandbox).

    This deliberately overrides :meth:`BaseEnvironment.execute` because the
    tool-gateway handles command preparation, CWD tracking, and env-snapshot
    management on the server side.  The base class's ``_wrap_command`` /
    ``_wait_for_process`` / snapshot machinery does not apply here — the
    gateway owns that responsibility.  See ``ManagedModalEnvironment`` for the
    concrete subclass.

    Subclasses implement ``_start_modal_exec``, ``_poll_modal_exec`` and
    ``_cancel_modal_exec`` and may tune the class-level settings below.
    """

    # How stdin reaches the command: "payload" forwards it on the prepared
    # exec, "heredoc" embeds it into the command text.
    _stdin_mode = "payload"
    # Seconds to sleep between polls of the transport.
    _poll_interval_seconds = 0.25
    # Extra seconds past the command timeout before the client gives up and
    # cancels; ``None`` disables the client-side deadline entirely.
    _client_timeout_grace_seconds: float | None = None
    # Output text reported when the command is interrupted.
    _interrupt_output = "[Command interrupted]"
    # Prefix for errors raised by the transport itself.
    _unexpected_error_prefix = "Modal execution error"
    # Seconds between activity heartbeats while a command is being polled.
    _activity_interval_seconds = 10.0

    def execute(
        self,
        command: str,
        cwd: str = "",
        *,
        timeout: int | None = None,
        stdin_data: str | None = None,
    ) -> dict:
        """Run ``command`` through the transport and poll until it finishes.

        Args:
            command: Shell command to run.
            cwd: Working directory; falls back to ``self.cwd`` when empty.
            timeout: Timeout in seconds; falls back to ``self.timeout`` when
                not given.
            stdin_data: Optional text to feed to the command's stdin.

        Returns:
            A result dict.  Results built here have ``output`` and
            ``returncode`` keys: 130 on interrupt, 124 on client-side
            timeout, 1 on transport errors.  Results produced by the
            transport are returned unchanged.
        """
        self._before_execute()
        prepared = self._prepare_modal_exec(
            command,
            cwd=cwd,
            timeout=timeout,
            stdin_data=stdin_data,
        )

        try:
            start = self._start_modal_exec(prepared)
        except Exception as exc:
            return self._error_result(f"{self._unexpected_error_prefix}: {exc}")

        if start.immediate_result is not None:
            return start.immediate_result
        if start.handle is None:
            return self._error_result(
                f"{self._unexpected_error_prefix}: transport did not return an exec handle"
            )

        started_at = time.monotonic()
        deadline = None
        if self._client_timeout_grace_seconds is not None:
            deadline = started_at + prepared.timeout + self._client_timeout_grace_seconds
        last_activity_touch = started_at

        while True:
            if is_interrupted():
                self._cancel_modal_exec_quietly(start.handle)
                return self._result(self._interrupt_output, 130)

            try:
                result = self._poll_modal_exec(start.handle)
            except Exception as exc:
                return self._error_result(f"{self._unexpected_error_prefix}: {exc}")
            if result is not None:
                return result

            now = time.monotonic()
            if deadline is not None and now >= deadline:
                self._cancel_modal_exec_quietly(start.handle)
                return self._timeout_result_for_modal(prepared.timeout)

            # Periodic activity touch so the gateway knows we're alive.
            if now - last_activity_touch >= self._activity_interval_seconds:
                last_activity_touch = now
                self._touch_activity(int(now - started_at))

            time.sleep(self._poll_interval_seconds)

    def _cancel_modal_exec_quietly(self, handle: Any) -> None:
        """Cancel the exec, ignoring any error raised by the transport.

        Cancellation is best-effort: the caller is already returning an
        interrupt or timeout result and a failed cancel must not replace it.
        """
        try:
            self._cancel_modal_exec(handle)
        except Exception:
            pass

    def _touch_activity(self, elapsed_seconds: int) -> None:
        """Report a heartbeat through the activity callback, if one is set.

        Any failure while looking up or invoking the callback is swallowed so
        that heartbeat problems never affect the running command.
        """
        try:
            # Imported lazily and inside the guard so a missing helper only
            # disables heartbeats instead of breaking execution.
            from tools.environments.base import _get_activity_callback

            callback = _get_activity_callback()
        except Exception:
            callback = None
        if not callback:
            return
        try:
            callback(f"modal command running ({elapsed_seconds}s elapsed)")
        except Exception:
            pass

    def _before_execute(self) -> None:
        """Hook for backends that need pre-exec sync or validation."""
        pass

    def _prepare_modal_exec(
        self,
        command: str,
        *,
        cwd: str = "",
        timeout: int | None = None,
        stdin_data: str | None = None,
    ) -> PreparedModalExec:
        """Normalize the arguments of ``execute`` into a ``PreparedModalExec``.

        Falsy ``cwd`` / ``timeout`` values (including ``0``) fall back to the
        instance defaults.  Stdin is either forwarded as a payload or embedded
        as a heredoc depending on ``_stdin_mode``; any sudo input returned by
        ``_prepare_command`` is piped into the final command.
        """
        effective_cwd = cwd or self.cwd
        effective_timeout = timeout or self.timeout

        exec_command = command
        exec_stdin = stdin_data if self._stdin_mode == "payload" else None
        if stdin_data is not None and self._stdin_mode == "heredoc":
            exec_command = wrap_modal_stdin_heredoc(exec_command, stdin_data)

        exec_command, sudo_stdin = self._prepare_command(exec_command)
        if sudo_stdin is not None:
            exec_command = wrap_modal_sudo_pipe(exec_command, sudo_stdin)

        return PreparedModalExec(
            command=exec_command,
            cwd=effective_cwd,
            timeout=effective_timeout,
            stdin_data=exec_stdin,
        )

    def _result(self, output: str, returncode: int) -> dict:
        """Build the common result dict shape."""
        return {
            "output": output,
            "returncode": returncode,
        }

    def _error_result(self, output: str) -> dict:
        """Build a failure result with return code 1."""
        return self._result(output, 1)

    def _timeout_result_for_modal(self, timeout: int) -> dict:
        """Build the result returned when the client-side deadline passes."""
        return self._result(f"Command timed out after {timeout}s", 124)

    @abstractmethod
    def _start_modal_exec(self, prepared: PreparedModalExec) -> ModalExecStart:
        """Begin a transport-specific exec."""

    @abstractmethod
    def _poll_modal_exec(self, handle: Any) -> dict | None:
        """Return a final result dict when complete, else ``None``."""

    @abstractmethod
    def _cancel_modal_exec(self, handle: Any) -> None:
        """Cancel or terminate the active transport exec."""