hermes-agent/tests/gateway/test_dead_targets.py
Teknium 290fa7fd2b
fix(gateway): skip confirmed-dead delivery targets (deleted groups, blocked bots) (#55115)
* fix(gateway): skip confirmed-dead delivery targets (deleted groups, blocked bots)

A deleted Telegram group, kicked/blocked bot, or deactivated user keeps
throwing Forbidden/not_found on every cron tick and fan-out delivery. Each
retry burns a send against the platform's flood-control envelope and spams
the logs, making the whole session feel broken even when the model call
completed.

Add a small persistent DeadTargetRegistry (per-profile JSON under
HERMES_HOME) that records a target the moment a send reports a whole-chat
death (forbidden / chat-level not_found), and have DeliveryRouter.deliver()
short-circuit it on subsequent attempts. Self-healing: any successful send
clears the flag, so a user re-adding the bot recovers with no manual cleanup.
Thread/topic-level not_found is NOT recorded (adapters already self-heal that
by retrying without reply_to). Transient/timeout errors are never marked dead.

* infographic: dead delivery target skipping
2026-06-29 13:23:29 -07:00

169 lines
6.8 KiB
Python

"""Tests for confirmed-dead delivery-target short-circuiting (deleted Telegram
groups, blocked/kicked bots, deactivated users).
Covers the full lifecycle through the real ``DeliveryRouter.deliver()`` path:
forbidden send -> target marked dead
next delivery -> short-circuited (adapter never called)
successful send -> dead flag cleared (self-healing)
and the standalone ``DeadTargetRegistry`` persistence/classification contract.
"""
import pytest
from gateway.config import GatewayConfig, Platform
from gateway.delivery import DeliveryRouter, DeliveryTarget
from gateway.dead_targets import DeadTargetRegistry
class ForbiddenThenOkAdapter:
"""First send raises a deleted-group Forbidden; subsequent sends succeed."""
def __init__(self, fail_times=1):
self.calls = []
self._fail_times = fail_times
async def send(self, chat_id, content, metadata=None):
self.calls.append(chat_id)
if len(self.calls) <= self._fail_times:
raise RuntimeError("Forbidden: the group chat was deleted")
return {"success": True}
class TransientFailAdapter:
async def send(self, chat_id, content, metadata=None):
raise RuntimeError("httpx.ReadTimeout: connection timed out")
@pytest.fixture
def isolate(tmp_path, monkeypatch):
monkeypatch.setattr("gateway.delivery.get_hermes_home", lambda: tmp_path)
monkeypatch.setattr("gateway.dead_targets.get_hermes_home", lambda: tmp_path)
return tmp_path
# --------------------------------------------------------------------------
# DeadTargetRegistry unit contract
# --------------------------------------------------------------------------
class TestDeadTargetRegistry:
def test_mark_is_dead_clear_roundtrip(self, isolate):
reg = DeadTargetRegistry()
assert reg.is_dead("telegram", "123") is False
assert reg.mark_dead("telegram", "123", "forbidden") is True
assert reg.is_dead("telegram", "123") is True
# idempotent: second mark returns False (already present)
assert reg.mark_dead("telegram", "123", "forbidden") is False
assert reg.clear("telegram", "123") is True
assert reg.is_dead("telegram", "123") is False
def test_persists_across_instances(self, isolate):
reg = DeadTargetRegistry()
reg.mark_dead("telegram", "999", "deleted group")
# New instance reads the same on-disk store under tmp HERMES_HOME.
reg2 = DeadTargetRegistry()
assert reg2.is_dead("telegram", "999") is True
def test_key_is_case_insensitive_on_platform(self, isolate):
reg = DeadTargetRegistry()
reg.mark_dead("TeleGram", "5", "x")
assert reg.is_dead("telegram", "5") is True
def test_none_chat_id_is_never_dead(self, isolate):
reg = DeadTargetRegistry()
assert reg.mark_dead("telegram", None) is False
assert reg.is_dead("telegram", None) is False
def test_is_dead_error_kind_classification(self):
assert DeadTargetRegistry.is_dead_error_kind("forbidden") is True
assert DeadTargetRegistry.is_dead_error_kind("not_found") is True
assert DeadTargetRegistry.is_dead_error_kind("rate_limited") is False
assert DeadTargetRegistry.is_dead_error_kind("transient") is False
assert DeadTargetRegistry.is_dead_error_kind(None) is False
def test_corrupt_store_degrades_to_empty(self, isolate):
path = isolate / "gateway" / "dead_targets.json"
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text("{ this is not json")
reg = DeadTargetRegistry() # must not raise
assert reg.all_dead() == {}
# --------------------------------------------------------------------------
# DeliveryRouter end-to-end lifecycle
# --------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_forbidden_marks_target_dead_then_short_circuits(isolate):
adapter = ForbiddenThenOkAdapter(fail_times=99)
router = DeliveryRouter(GatewayConfig(), adapters={Platform.TELEGRAM: adapter})
target = DeliveryTarget.parse("telegram:42")
# First delivery: send raises Forbidden -> failure + target recorded dead.
res1 = await router.deliver("hi", [target])
assert res1["telegram:42"]["success"] is False
assert router.dead_targets.is_dead("telegram", "42") is True
assert adapter.calls == ["42"] # adapter was invoked once
# Second delivery: short-circuited, adapter NOT called again.
res2 = await router.deliver("hi again", [target])
assert res2["telegram:42"]["skipped"] == "dead_target"
assert res2["telegram:42"]["success"] is False
assert adapter.calls == ["42"] # still only the original call
@pytest.mark.asyncio
async def test_successful_send_clears_dead_flag(isolate):
# Fails once (gets marked dead), then succeeds.
adapter = ForbiddenThenOkAdapter(fail_times=1)
router = DeliveryRouter(GatewayConfig(), adapters={Platform.TELEGRAM: adapter})
target = DeliveryTarget.parse("telegram:7")
# Pre-seed dead via the first (failing) delivery.
await router.deliver("a", [target])
assert router.dead_targets.is_dead("telegram", "7") is True
# Manually clear to simulate the user re-adding the bot, then deliver again.
router.dead_targets.clear("telegram", "7")
res = await router.deliver("b", [target])
assert res["telegram:7"]["success"] is True
# Flag stays cleared after a successful send.
assert router.dead_targets.is_dead("telegram", "7") is False
@pytest.mark.asyncio
async def test_transient_failure_does_not_mark_dead(isolate):
adapter = TransientFailAdapter()
router = DeliveryRouter(GatewayConfig(), adapters={Platform.TELEGRAM: adapter})
target = DeliveryTarget.parse("telegram:13")
res = await router.deliver("hi", [target])
assert res["telegram:13"]["success"] is False
# A timeout/transient error must NOT mark the chat dead — it may recover.
assert router.dead_targets.is_dead("telegram", "13") is False
@pytest.mark.asyncio
async def test_local_target_is_never_dead_tracked(isolate):
router = DeliveryRouter(GatewayConfig(), adapters={})
target = DeliveryTarget.parse("local")
res = await router.deliver("hi", [target])
assert res["local"]["success"] is True
assert router.dead_targets.all_dead() == {}
@pytest.mark.asyncio
async def test_shared_registry_is_used_when_injected(isolate):
shared = DeadTargetRegistry()
shared.mark_dead("telegram", "500", "pre-existing")
adapter = ForbiddenThenOkAdapter(fail_times=0)
router = DeliveryRouter(
GatewayConfig(),
adapters={Platform.TELEGRAM: adapter},
dead_targets=shared,
)
target = DeliveryTarget.parse("telegram:500")
res = await router.deliver("hi", [target])
# Injected registry's pre-existing flag short-circuits before any send.
assert res["telegram:500"]["skipped"] == "dead_target"
assert adapter.calls == []