mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
When a Telegram /restart fires and PTB's graceful-shutdown `get_updates`
ACK call times out ("When polling for updates is restarted, updates may
be received twice" in gateway.log), the new gateway receives the same
/restart again and restarts a second time — a self-perpetuating loop.
Record the triggering update_id in `.restart_last_processed.json` when
handling /restart. In the next process, reject any /restart whose
update_id <= the recorded one as a stale redelivery. A 5-minute staleness
guard ensures an orphaned marker can't block a legitimately new /restart.
- gateway/platforms/base.py: add `platform_update_id` to MessageEvent
- gateway/platforms/telegram.py: propagate `update.update_id` through
_build_message_event for text/command/location/media handlers
- gateway/run.py: write dedup marker in _handle_restart_command;
_is_stale_restart_redelivery checks it before processing /restart
- tests/gateway/test_restart_redelivery_dedup.py: 9 new tests covering
fresh restart, redelivery, staleness window, cross-platform,
malformed-marker resilience, and no-update_id (CLI) bypass
Only active for Telegram today (the one platform with monotonic
cross-session update ordering); other platforms return False from
_is_stale_restart_redelivery and proceed normally.
247 lines
8.8 KiB
Python
247 lines
8.8 KiB
Python
"""Tests for /restart idempotency guard against Telegram update re-delivery.
|
|
|
|
When PTB's graceful-shutdown ACK call (the final `get_updates` on exit) fails
|
|
with a network error, Telegram re-delivers the `/restart` message to the new
|
|
gateway process. Without a dedup guard, the new gateway would process
|
|
`/restart` again and immediately restart — a self-perpetuating loop.
|
|
"""
|
|
import asyncio
|
|
import json
|
|
import time
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
import gateway.run as gateway_run
|
|
from gateway.platforms.base import MessageEvent, MessageType
|
|
from tests.gateway.restart_test_helpers import make_restart_runner, make_restart_source
|
|
|
|
|
|
def _make_restart_event(update_id: int | None = 100) -> MessageEvent:
    """Build a ``/restart`` text event tagged with *update_id*.

    The event uses the shared restart test source and a fixed message id
    (``"m1"``). Pass ``update_id=None`` to produce an event that carries no
    platform update id.
    """
    source = make_restart_source()
    event = MessageEvent(
        text="/restart",
        message_type=MessageType.TEXT,
        source=source,
        message_id="m1",
        platform_update_id=update_id,
    )
    return event
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_restart_handler_writes_dedup_marker_with_update_id(tmp_path, monkeypatch):
    """First /restart writes .restart_last_processed.json with the triggering update_id.

    Also verifies that the restart was actually requested, so a handler that
    writes the marker but never triggers the restart cannot pass.
    """
    # Redirect the gateway's home directory to the per-test temp dir so the
    # marker file lands somewhere we can inspect.
    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    # NOTE(review): INVOCATION_ID is presumably consulted by the restart
    # handler (it is set by systemd for managed services) — confirm in
    # gateway.run. Every test in this module clears it.
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    runner, _adapter = make_restart_runner()
    runner.request_restart = MagicMock(return_value=True)

    event = _make_restart_event(update_id=12345)
    result = await runner._handle_restart_command(event)

    assert "Restarting gateway" in result
    # Same check the sibling "restart proceeds" tests make.
    runner.request_restart.assert_called_once()

    marker_path = tmp_path / ".restart_last_processed.json"
    assert marker_path.exists()

    data = json.loads(marker_path.read_text())
    assert data["platform"] == "telegram"
    assert data["update_id"] == 12345
    assert isinstance(data["requested_at"], (int, float))
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_redelivered_restart_with_same_update_id_is_ignored(tmp_path, monkeypatch):
    """A /restart whose update_id matches the recorded marker is dropped as a redelivery."""
    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    # Pretend the previous gateway process recorded update_id=12345 five
    # seconds ago.
    recorded = {
        "platform": "telegram",
        "update_id": 12345,
        "requested_at": time.time() - 5,
    }
    (tmp_path / ".restart_last_processed.json").write_text(json.dumps(recorded))

    runner, _adapter = make_restart_runner()
    restart_mock = MagicMock()
    runner.request_restart = restart_mock

    # Identical update_id → this is the redelivered message.
    redelivered = _make_restart_event(update_id=12345)
    reply = await runner._handle_restart_command(redelivered)

    # The handler answers with an empty string and requests no restart.
    assert reply == ""
    restart_mock.assert_not_called()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_redelivered_restart_with_older_update_id_is_ignored(tmp_path, monkeypatch):
    """An update_id strictly below the recorded one is treated as a redelivery too."""
    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    recorded = {
        "platform": "telegram",
        "update_id": 12345,
        "requested_at": time.time() - 5,
    }
    (tmp_path / ".restart_last_processed.json").write_text(json.dumps(recorded))

    runner, _adapter = make_restart_runner()
    restart_mock = MagicMock()
    runner.request_restart = restart_mock

    # An older update is not expected in practice, but if Telegram does
    # re-deliver something older it must still be considered stale.
    older_event = _make_restart_event(update_id=12344)
    reply = await runner._handle_restart_command(older_event)

    assert reply == ""
    restart_mock.assert_not_called()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_fresh_restart_with_higher_update_id_is_processed(tmp_path, monkeypatch):
    """A genuinely new /restart (higher update_id) is not blocked by the dedup marker."""
    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    # Marker left behind by the previous restart (update_id=12345).
    marker_file = tmp_path / ".restart_last_processed.json"
    previous = {
        "platform": "telegram",
        "update_id": 12345,
        "requested_at": time.time() - 5,
    }
    marker_file.write_text(json.dumps(previous))

    runner, _adapter = make_restart_runner()
    restart_mock = MagicMock(return_value=True)
    runner.request_restart = restart_mock

    # Strictly higher update_id → a fresh command from the user.
    fresh_event = _make_restart_event(update_id=12346)
    reply = await runner._handle_restart_command(fresh_event)

    assert "Restarting gateway" in reply
    restart_mock.assert_called_once()

    # The marker must now carry the new update_id.
    rewritten = json.loads(marker_file.read_text())
    assert rewritten["update_id"] == 12346
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_stale_marker_older_than_5min_does_not_block(tmp_path, monkeypatch):
    """A marker outside the 5-minute window is disregarded, so /restart goes ahead."""
    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    ten_minutes_ago = time.time() - 600
    expired = {
        "platform": "telegram",
        "update_id": 12345,
        "requested_at": ten_minutes_ago,
    }
    (tmp_path / ".restart_last_processed.json").write_text(json.dumps(expired))

    runner, _adapter = make_restart_runner()
    restart_mock = MagicMock(return_value=True)
    runner.request_restart = restart_mock

    # The update_id equals the one in the marker, but the marker is too old
    # to be trusted.
    event = _make_restart_event(update_id=12345)
    reply = await runner._handle_restart_command(event)

    assert "Restarting gateway" in reply
    restart_mock.assert_called_once()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_no_marker_file_allows_restart(tmp_path, monkeypatch):
    """With no marker on disk (clean start), /restart is handled normally."""
    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    runner, _adapter = make_restart_runner()
    restart_mock = MagicMock(return_value=True)
    runner.request_restart = restart_mock

    reply = await runner._handle_restart_command(_make_restart_event(update_id=100))

    assert "Restarting gateway" in reply
    restart_mock.assert_called_once()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_corrupt_marker_file_is_treated_as_absent(tmp_path, monkeypatch):
    """A marker containing invalid JSON must not crash the handler; /restart proceeds."""
    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    # Deliberately unparseable marker content.
    (tmp_path / ".restart_last_processed.json").write_text("not-json{")

    runner, _adapter = make_restart_runner()
    restart_mock = MagicMock(return_value=True)
    runner.request_restart = restart_mock

    reply = await runner._handle_restart_command(_make_restart_event(update_id=100))

    assert "Restarting gateway" in reply
    restart_mock.assert_called_once()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_event_without_update_id_bypasses_dedup(tmp_path, monkeypatch):
    """An event lacking platform_update_id (non-Telegram, CLI fallback) is never gated."""
    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    # A fresh marker with a very high update_id.
    recorded = {
        "platform": "telegram",
        "update_id": 999999,
        "requested_at": time.time(),
    }
    (tmp_path / ".restart_last_processed.json").write_text(json.dumps(recorded))

    runner, _adapter = make_restart_runner()
    restart_mock = MagicMock(return_value=True)
    runner.request_restart = restart_mock

    # Without an update_id the dedup check must NOT apply.
    id_less_event = _make_restart_event(update_id=None)
    reply = await runner._handle_restart_command(id_less_event)

    assert "Restarting gateway" in reply
    restart_mock.assert_called_once()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_different_platform_bypasses_dedup(tmp_path, monkeypatch):
    """A Telegram marker must not suppress a /restart that arrives from another platform."""
    from gateway.config import Platform
    from gateway.session import SessionSource

    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    monkeypatch.delenv("INVOCATION_ID", raising=False)

    telegram_marker = {
        "platform": "telegram",
        "update_id": 12345,
        "requested_at": time.time(),
    }
    (tmp_path / ".restart_last_processed.json").write_text(json.dumps(telegram_marker))

    runner, _adapter = make_restart_runner()
    restart_mock = MagicMock(return_value=True)
    runner.request_restart = restart_mock

    # The command comes from Discord, so it is not a redelivery candidate
    # even though its update id equals the one stored in the marker.
    discord_event = MessageEvent(
        text="/restart",
        message_type=MessageType.TEXT,
        source=SessionSource(
            platform=Platform.DISCORD,
            chat_id="discord-chan",
            chat_type="dm",
            user_id="u1",
        ),
        message_id="m1",
        platform_update_id=12345,
    )
    reply = await runner._handle_restart_command(discord_event)

    assert "Restarting gateway" in reply
    restart_mock.assert_called_once()
|