mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(gateway): scale-to-zero never armed — arm-gate counted disabled placeholder platforms (#52831)
The scale-to-zero idle watcher never started on a correctly-opted-in, relay-only instance, so the gateway never ran its idle decision, never called go_dormant(), and never sent going_idle to the connector. Fly's autostop still suspended the machine on traffic-idle, but the connector never flipped the instance to buffered-only — so an inbound DM took the live delivery path, found no live session for the suspended machine, and was dropped fail-closed with no wake poke. The machine slept and never woke.

Root cause: _scale_to_zero_should_arm() passed list(config.platforms.keys()) to messaging_is_relay_only_or_absent(). config.platforms is pre-seeded with a DISABLED placeholder PlatformConfig for every known platform (telegram, discord, slack, matrix, …), so the key set is always the full ~20-entry catalog regardless of what the instance actually runs. The relay-only check discarded "relay", saw the disabled placeholders as live direct-socket platforms, and returned False — so should_arm() was False and the watcher was never created.

Verified live on a staging instance: config.platforms keys = [telegram, discord, slack, mattermost, matrix, relay] with only relay enabled=True; should_arm() = False.

Fix: filter config.platforms to ENABLED entries before the relay-only check, mirroring the adapter-connect loop, which already gates on `if not platform_config.enabled: continue`. This arms off the same notion of "active platform" the rest of start() already uses — no parallel concept.

Also add a one-line not-armed diagnostic: when an instance IS opted in (the HERMES_SCALE_TO_ZERO stamp is set) but the watcher still doesn't arm, log why (relay_only_or_absent, the enabled platforms, wake_url present/missing). A non-opted instance stays silent. The arm path previously logged only on success, so a failed arm was invisible.

Tests: the existing pure-helper tests passed bare names, so they never exercised the call site that feeds the placeholder-laden config. Add behaviour-contract tests against the REAL _scale_to_zero_should_arm with a realistic config.platforms (relay enabled + others disabled). The F25 regression test (relay-only + disabled placeholders must arm) and the no-platform case are RED without this fix, GREEN with it; the genuinely-enabled-direct-platform / not-opted-in / no-wake-url cases stay correctly non-arming so the filter can't over-broaden.

Wake mechanism itself verified healthy independently (direct wakeUrl GET resumed a suspended staging instance in 1.15s, clean resume signature).
This commit is contained in:
parent
a4091e49f1
commit
dedf5643d8
2 changed files with 151 additions and 1 deletions
|
|
@ -3637,7 +3637,20 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
)
|
||||
|
||||
try:
|
||||
platforms = list(self.config.platforms.keys()) if self.config else []
|
||||
# Only ENABLED platforms count. `config.platforms` is pre-seeded with a
|
||||
# disabled placeholder PlatformConfig for every KNOWN platform (telegram,
|
||||
# discord, slack, …), so `.keys()` is the full ~20-entry catalog regardless
|
||||
# of what this instance actually runs. Passing the bare keys made
|
||||
# `messaging_is_relay_only_or_absent` see those placeholders as live
|
||||
# direct-socket platforms and return False, so scale-to-zero NEVER armed on
|
||||
# a real relay-only instance. Mirror the connect loop, which already gates on
|
||||
# `platform_config.enabled` (see the `if not platform_config.enabled: continue`
|
||||
# in the adapter-connect loop) — arm off the same notion of "active platform."
|
||||
platforms = (
|
||||
[p for p, pc in self.config.platforms.items() if getattr(pc, "enabled", False)]
|
||||
if self.config
|
||||
else []
|
||||
)
|
||||
except Exception: # noqa: BLE001
|
||||
platforms = []
|
||||
try:
|
||||
|
|
@ -3650,6 +3663,52 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
wake_url=wake_url,
|
||||
)
|
||||
|
||||
def _log_scale_to_zero_not_armed_reason(self) -> None:
    """Emit one INFO line explaining a failed arm, for opted-in instances only.

    When ``scale_to_zero_enabled()`` is falsy the instance has not opted in;
    not arming is then the expected outcome and nothing is logged. Otherwise
    the line reports the relay-only verdict, the enabled platforms, and
    whether a wake URL resolved, so the cause can be found from the logs.

    Failures inside the diagnostic are caught and logged at DEBUG; a failing
    import of the ``gateway`` helpers is not caught here.
    """
    from gateway.relay import relay_wake_url
    from gateway.scale_to_zero import (
        messaging_is_relay_only_or_absent,
        scale_to_zero_enabled,
    )

    try:
        if not scale_to_zero_enabled():
            return  # not opted in — normal, stay quiet

        # Collect only the ENABLED platforms; keys that carry a ``.value``
        # are reported by that value, anything else is reported as-is.
        active = []
        try:
            if self.config:
                for platform, platform_config in self.config.platforms.items():
                    if getattr(platform_config, "enabled", False):
                        active.append(getattr(platform, "value", platform))
        except Exception:  # noqa: BLE001
            # A partially built list is discarded, matching an all-or-nothing read.
            active = []

        relay_only = messaging_is_relay_only_or_absent(active)

        try:
            wake_url = relay_wake_url()
        except Exception:  # noqa: BLE001
            wake_url = None

        # Only presence is logged, never the URL itself.
        wake_state = "set" if wake_url else "MISSING"

        logger.info(
            "scale-to-zero: NOT armed despite opt-in — "
            "relay_only_or_absent=%s (enabled platforms=%s), wake_url=%s. "
            "Need relay-only messaging + a registered wake URL.",
            relay_only,
            active or "none",
            wake_state,
        )
    except Exception:  # noqa: BLE001 - diagnostics must never block startup
        logger.debug("scale-to-zero: not-armed reason logging failed", exc_info=True)
|
||||
|
||||
def _scale_to_zero_is_idle(self) -> bool:
|
||||
from gateway.scale_to_zero import is_idle
|
||||
|
||||
|
|
@ -6150,6 +6209,11 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
self._scale_to_zero_idle_timeout_seconds(),
|
||||
)
|
||||
asyncio.create_task(self._scale_to_zero_watcher())
|
||||
else:
|
||||
# Surface WHY an OPTED-IN instance didn't arm (a non-opted instance
|
||||
# not arming is normal — stay silent there). Without this, a failed
|
||||
# arm is invisible and "why won't it suspend/wake?" needs a box-dive.
|
||||
self._log_scale_to_zero_not_armed_reason()
|
||||
except Exception: # noqa: BLE001 - arming must never block startup
|
||||
logger.debug("scale-to-zero: arm check failed at startup", exc_info=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -149,3 +149,89 @@ def test_bg_work_false_when_quiet():
|
|||
r._background_tasks = set()
|
||||
# No background tasks, no active processes in this fresh process.
|
||||
assert r._scale_to_zero_has_live_background_work() is False
|
||||
|
||||
|
||||
# ── _scale_to_zero_should_arm: the CALL SITE feeds config.platforms (the F25 bug) ──
|
||||
#
|
||||
# config.platforms is pre-seeded with a DISABLED placeholder PlatformConfig for every
|
||||
# known platform, so list(config.platforms.keys()) is always the full ~20-entry catalog
|
||||
# regardless of what the instance runs. The arm check must filter to ENABLED platforms
|
||||
# (mirroring the connect loop) before asking messaging_is_relay_only_or_absent — passing
|
||||
# the bare placeholder keys made it see disabled `discord`/`telegram`/… as live direct
|
||||
# platforms and refuse to arm on a real relay-only instance. The pure-helper tests in
|
||||
# test_scale_to_zero.py pass bare names so they never exercised this call site.
|
||||
|
||||
|
||||
def _arm_runner(monkeypatch, platform_states, *, enabled=True, wake_url="https://wake.example"):
    """Return a bare GatewayRunner wired for exercising the real arm check.

    ``platform_states`` maps each platform key to its enabled flag; every
    entry becomes a ``PlatformConfig(enabled=...)`` in ``config.platforms``.
    The runner is created with ``__new__`` so ``__init__`` never runs. Only
    the opt-in flag (``enabled``) and the wake URL lookup (``wake_url``) are
    stubbed via ``monkeypatch``.
    """
    from types import SimpleNamespace

    from gateway.config import PlatformConfig

    runner = GatewayRunner.__new__(GatewayRunner)

    platform_configs = {}
    for platform, is_enabled in platform_states.items():
        platform_configs[platform] = PlatformConfig(enabled=is_enabled)
    runner.config = SimpleNamespace(platforms=platform_configs)

    def _fake_enabled(*args, **kwargs):
        # Accept any call shape; the stubbed opt-in ignores its arguments.
        return enabled

    def _fake_wake_url():
        return wake_url

    monkeypatch.setattr("gateway.scale_to_zero.scale_to_zero_enabled", _fake_enabled)
    monkeypatch.setattr("gateway.relay.relay_wake_url", _fake_wake_url)
    return runner
|
||||
|
||||
|
||||
def test_arm_true_for_relay_only_with_disabled_placeholders(monkeypatch):
    """F25 regression: relay enabled, other platforms present but disabled.

    The arm check must return True, i.e. disabled placeholder entries must
    not be counted as live direct-socket platforms.
    """
    from gateway.platforms.base import Platform

    disabled_placeholders = (
        Platform.TELEGRAM,
        Platform.DISCORD,
        Platform.SLACK,
        Platform.MATRIX,
    )
    # Same insertion order as a literal dict: placeholders first, relay last.
    states = dict.fromkeys(disabled_placeholders, False)
    states[Platform.RELAY] = True

    runner = _arm_runner(monkeypatch, states)
    assert runner._scale_to_zero_should_arm() is True
|
||||
|
||||
|
||||
def test_no_arm_when_a_direct_platform_is_actually_enabled(monkeypatch):
    """An enabled direct platform alongside relay must keep the watcher disarmed.

    Guards against the enabled-filter over-broadening into "ignore everything
    but relay".
    """
    from gateway.platforms.base import Platform

    states = {Platform.DISCORD: True, Platform.RELAY: True}
    runner = _arm_runner(monkeypatch, states)
    assert runner._scale_to_zero_should_arm() is False
|
||||
|
||||
|
||||
def test_arm_when_no_platform_enabled_at_all(monkeypatch):
    """With every configured platform disabled, the arm check returns True.

    Covers the no-messaging case (e.g. a Chronos-only agent).
    """
    from gateway.platforms.base import Platform

    all_disabled = dict.fromkeys((Platform.TELEGRAM, Platform.DISCORD), False)
    runner = _arm_runner(monkeypatch, all_disabled)
    assert runner._scale_to_zero_should_arm() is True
|
||||
|
||||
|
||||
def test_no_arm_when_not_opted_in(monkeypatch):
    """Relay-only, but the opt-in flag is stubbed off: the check must not arm."""
    from gateway.platforms.base import Platform

    relay_only = {Platform.RELAY: True}
    runner = _arm_runner(monkeypatch, relay_only, enabled=False)
    assert runner._scale_to_zero_should_arm() is False
|
||||
|
||||
|
||||
def test_no_arm_without_wake_url(monkeypatch):
    """Relay-only and opted in, but no wake URL resolves: no arm (§3.4(1))."""
    from gateway.platforms.base import Platform

    relay_only = {Platform.RELAY: True}
    runner = _arm_runner(monkeypatch, relay_only, wake_url=None)
    assert runner._scale_to_zero_should_arm() is False
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue