hermes-agent/tests/gateway/test_restart_redelivery_dedup.py
Teknium 45acd9beb5
fix(gateway): ignore redelivered /restart after PTB offset ACK fails (#11940)
When a Telegram /restart fires and PTB's graceful-shutdown `get_updates`
ACK call times out ("When polling for updates is restarted, updates may
be received twice" in gateway.log), the new gateway receives the same
/restart again and restarts a second time — a self-perpetuating loop.

Record the triggering update_id in `.restart_last_processed.json` when
handling /restart.  On the next process, reject a /restart whose
update_id <= the recorded one as a stale redelivery.  5-minute staleness
guard so an orphaned marker can't block a legitimately new /restart.

- gateway/platforms/base.py: add `platform_update_id` to MessageEvent
- gateway/platforms/telegram.py: propagate `update.update_id` through
  _build_message_event for text/command/location/media handlers
- gateway/run.py: write dedup marker in _handle_restart_command;
  _is_stale_restart_redelivery checks it before processing /restart
- tests/gateway/test_restart_redelivery_dedup.py: 9 new tests covering
  fresh restart, redelivery, staleness window, cross-platform,
  malformed-marker resilience, and no-update_id (CLI) bypass

Only active for Telegram today (the one platform with monotonic
cross-session update ordering); other platforms return False from
_is_stale_restart_redelivery and proceed normally.
2026-04-17 21:17:33 -07:00

247 lines
8.8 KiB
Python

"""Tests for /restart idempotency guard against Telegram update re-delivery.
When PTB's graceful-shutdown ACK call (the final `get_updates` on exit) fails
with a network error, Telegram re-delivers the `/restart` message to the new
gateway process. Without a dedup guard, the new gateway would process
`/restart` again and immediately restart — a self-perpetuating loop.
"""
import asyncio
import json
import time
from unittest.mock import MagicMock
import pytest
import gateway.run as gateway_run
from gateway.platforms.base import MessageEvent, MessageType
from tests.gateway.restart_test_helpers import make_restart_runner, make_restart_source
def _make_restart_event(update_id: int | None = 100) -> MessageEvent:
return MessageEvent(
text="/restart",
message_type=MessageType.TEXT,
source=make_restart_source(),
message_id="m1",
platform_update_id=update_id,
)
@pytest.mark.asyncio
async def test_restart_handler_writes_dedup_marker_with_update_id(tmp_path, monkeypatch):
"""First /restart writes .restart_last_processed.json with the triggering update_id."""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
event = _make_restart_event(update_id=12345)
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
marker_path = tmp_path / ".restart_last_processed.json"
assert marker_path.exists()
data = json.loads(marker_path.read_text())
assert data["platform"] == "telegram"
assert data["update_id"] == 12345
assert isinstance(data["requested_at"], (int, float))
@pytest.mark.asyncio
async def test_redelivered_restart_with_same_update_id_is_ignored(tmp_path, monkeypatch):
"""A /restart with update_id <= recorded marker is silently ignored as a redelivery."""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
# Previous gateway recorded update_id=12345 a few seconds ago
marker = tmp_path / ".restart_last_processed.json"
marker.write_text(json.dumps({
"platform": "telegram",
"update_id": 12345,
"requested_at": time.time() - 5,
}))
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock()
event = _make_restart_event(update_id=12345) # same update_id → redelivery
result = await runner._handle_restart_command(event)
assert result == "" # silently ignored
runner.request_restart.assert_not_called()
@pytest.mark.asyncio
async def test_redelivered_restart_with_older_update_id_is_ignored(tmp_path, monkeypatch):
"""update_id strictly LESS than the recorded one is also a redelivery."""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
marker = tmp_path / ".restart_last_processed.json"
marker.write_text(json.dumps({
"platform": "telegram",
"update_id": 12345,
"requested_at": time.time() - 5,
}))
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock()
event = _make_restart_event(update_id=12344) # older update — shouldn't happen,
# but if Telegram does re-deliver
# something older, treat as stale
result = await runner._handle_restart_command(event)
assert result == ""
runner.request_restart.assert_not_called()
@pytest.mark.asyncio
async def test_fresh_restart_with_higher_update_id_is_processed(tmp_path, monkeypatch):
"""A NEW /restart from the user (higher update_id) bypasses the dedup guard."""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
# Previous restart recorded update_id=12345
marker = tmp_path / ".restart_last_processed.json"
marker.write_text(json.dumps({
"platform": "telegram",
"update_id": 12345,
"requested_at": time.time() - 5,
}))
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
event = _make_restart_event(update_id=12346) # strictly higher → fresh
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()
# Marker is overwritten with the new update_id
data = json.loads(marker.read_text())
assert data["update_id"] == 12346
@pytest.mark.asyncio
async def test_stale_marker_older_than_5min_does_not_block(tmp_path, monkeypatch):
"""A marker older than the 5-minute window is ignored — fresh /restart proceeds."""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
marker = tmp_path / ".restart_last_processed.json"
marker.write_text(json.dumps({
"platform": "telegram",
"update_id": 12345,
"requested_at": time.time() - 600, # 10 minutes ago
}))
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
# Same update_id as the stale marker, but the marker is too old to trust
event = _make_restart_event(update_id=12345)
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()
@pytest.mark.asyncio
async def test_no_marker_file_allows_restart(tmp_path, monkeypatch):
"""Clean gateway start (no prior marker) processes /restart normally."""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
event = _make_restart_event(update_id=100)
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()
@pytest.mark.asyncio
async def test_corrupt_marker_file_is_treated_as_absent(tmp_path, monkeypatch):
"""Malformed JSON in the marker file doesn't crash — /restart proceeds."""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
marker = tmp_path / ".restart_last_processed.json"
marker.write_text("not-json{")
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
event = _make_restart_event(update_id=100)
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()
@pytest.mark.asyncio
async def test_event_without_update_id_bypasses_dedup(tmp_path, monkeypatch):
"""Events with no platform_update_id (non-Telegram, CLI fallback) aren't gated."""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
marker = tmp_path / ".restart_last_processed.json"
marker.write_text(json.dumps({
"platform": "telegram",
"update_id": 999999,
"requested_at": time.time(),
}))
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
# No update_id — the dedup check should NOT kick in
event = _make_restart_event(update_id=None)
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()
@pytest.mark.asyncio
async def test_different_platform_bypasses_dedup(tmp_path, monkeypatch):
"""Marker from Telegram doesn't block a /restart from another platform."""
from gateway.config import Platform
from gateway.session import SessionSource
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
marker = tmp_path / ".restart_last_processed.json"
marker.write_text(json.dumps({
"platform": "telegram",
"update_id": 12345,
"requested_at": time.time(),
}))
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
# /restart from Discord — not a redelivery candidate
discord_source = SessionSource(
platform=Platform.DISCORD,
chat_id="discord-chan",
chat_type="dm",
user_id="u1",
)
event = MessageEvent(
text="/restart",
message_type=MessageType.TEXT,
source=discord_source,
message_id="m1",
platform_update_id=12345,
)
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()