fix(whatsapp): restart stale bridge processes instead of silently reusing them (#44205)

A long-lived Baileys bridge survives gateway restarts AND hermes update:
connect() adopted any bridge already listening with status connected, and
disconnect() only kills bridges the adapter spawned itself. Users who
updated to get inbound media support kept talking to a bridge process
serving months-old bridge.js — images and voice notes still arrived as
placeholders with no cached file path (refs #19105 follow-up reports).

Three fixes in the same stale-bridge class:

- Staleness handshake: bridge.js reports a sha256 self-hash in /health
  (scriptHash); connect() compares it against bridge.js on disk and
  restarts the bridge on mismatch. Pre-handshake bridges report no hash
  and are treated as stale, so every existing stale bridge gets recycled
  exactly once on the next gateway start.
- npm dep refresh: deps reinstall when package.json changes (stamp file
  in node_modules), not only when node_modules is missing — a Baileys
  pin bump now actually lands.
- Cache-dir passthrough: the gateway passes profile-aware
  HERMES_{IMAGE,AUDIO,DOCUMENT}_CACHE_DIR to the bridge instead of the
  bridge hardcoding ~/.hermes/image_cache etc., fixing media paths under
  HERMES_HOME overrides, profiles, and the new cache/ layout.
This commit is contained in:
Teknium 2026-06-11 03:47:29 -07:00 committed by GitHub
parent 875aa8f162
commit 3edd09a46f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 435 additions and 12 deletions

View file

@ -191,6 +191,22 @@ from gateway.platforms.base import (
)
def _file_content_hash(path: Path) -> str:
"""Return the first 16 hex chars of the SHA-256 of *path*'s contents.
Used for the bridge staleness handshake: bridge.js reports its own
source hash in ``/health`` (``scriptHash``), and the adapter compares
it against the hash of bridge.js currently on disk. A mismatch means
a long-lived bridge process is serving code from before an update.
Returns ``""`` when the file can't be read.
"""
import hashlib
try:
return hashlib.sha256(path.read_bytes()).hexdigest()[:16]
except OSError:
return ""
def check_whatsapp_requirements() -> bool:
"""
Check if WhatsApp dependencies are available.
@ -587,9 +603,21 @@ class WhatsAppAdapter(BasePlatformAdapter):
logger.warning("[%s] Could not acquire session lock (non-fatal): %s", self.name, e)
try:
# Auto-install npm dependencies if node_modules doesn't exist
# Auto-install npm dependencies when node_modules is missing OR
# package.json changed since the last install (e.g. after
# `hermes update` bumps the Baileys pin). The stamp file records
# the package.json hash of the last successful install.
bridge_dir = bridge_path.parent
if not (bridge_dir / "node_modules").exists():
_pkg_json = bridge_dir / "package.json"
_dep_stamp = bridge_dir / "node_modules" / ".hermes-pkg-hash"
_pkg_hash = _file_content_hash(_pkg_json)
_deps_fresh = False
if (bridge_dir / "node_modules").exists():
try:
_deps_fresh = (_dep_stamp.read_text().strip() == _pkg_hash) and bool(_pkg_hash)
except OSError:
_deps_fresh = False
if not _deps_fresh:
print(f"[{self.name}] Installing WhatsApp bridge dependencies...")
# Resolve npm path so Windows can execute the .cmd shim.
# shutil.which honours PATHEXT; on POSIX it returns the
@ -610,6 +638,11 @@ class WhatsAppAdapter(BasePlatformAdapter):
print(f"[{self.name}] npm install failed: {install_result.stderr}")
return False
print(f"[{self.name}] Dependencies installed")
if _pkg_hash:
try:
_dep_stamp.write_text(_pkg_hash)
except OSError:
pass # Stamp is an optimization; install still succeeded
except Exception as e:
print(f"[{self.name}] Failed to install dependencies: {e}")
return False
@ -629,12 +662,28 @@ class WhatsAppAdapter(BasePlatformAdapter):
data = await resp.json()
bridge_status = data.get("status", "unknown")
if bridge_status == "connected":
print(f"[{self.name}] Using existing bridge (status: {bridge_status})")
self._mark_connected()
self._bridge_process = None # Not managed by us
self._http_session = aiohttp.ClientSession()
self._poll_task = asyncio.create_task(self._poll_messages())
return True
# Staleness handshake: only reuse a running
# bridge if it is serving the same bridge.js
# that is on disk right now. A long-lived
# bridge survives gateway restarts AND
# `hermes update`, so without this check it
# keeps serving pre-update code forever
# (e.g. no inbound media download). Old
# bridges that don't report scriptHash are
# treated as stale by definition.
running_hash = data.get("scriptHash", "")
disk_hash = _file_content_hash(bridge_path)
if running_hash and disk_hash and running_hash == disk_hash:
print(f"[{self.name}] Using existing bridge (status: {bridge_status})")
self._mark_connected()
self._bridge_process = None # Not managed by us
self._http_session = aiohttp.ClientSession()
self._poll_task = asyncio.create_task(self._poll_messages())
return True
print(
f"[{self.name}] Running bridge is stale "
f"(running={running_hash or 'unversioned'}, disk={disk_hash}), restarting"
)
else:
print(f"[{self.name}] Bridge found but not connected (status: {bridge_status}), restarting")
except Exception:
@ -659,6 +708,18 @@ class WhatsAppAdapter(BasePlatformAdapter):
bridge_env = os.environ.copy()
if self._reply_prefix is not None:
bridge_env["WHATSAPP_REPLY_PREFIX"] = self._reply_prefix
# Pass the profile-aware cache directories so the bridge writes
# media where the Python side reads it. Without these the bridge
# hardcodes ~/.hermes/{image,audio,document}_cache, which diverges
# under HERMES_HOME overrides, profiles, and the new cache/ layout.
from gateway.platforms.base import (
get_audio_cache_dir as _get_audio_dir,
get_document_cache_dir as _get_doc_dir,
get_image_cache_dir as _get_img_dir,
)
bridge_env["HERMES_IMAGE_CACHE_DIR"] = str(_get_img_dir())
bridge_env["HERMES_AUDIO_CACHE_DIR"] = str(_get_audio_dir())
bridge_env["HERMES_DOCUMENT_CACHE_DIR"] = str(_get_doc_dir())
self._bridge_process = subprocess.Popen(
[

View file

@ -24,7 +24,8 @@ import { Boom } from '@hapi/boom';
import pino from 'pino';
import path from 'path';
import { mkdirSync, readFileSync, writeFileSync, existsSync, readdirSync, unlinkSync } from 'fs';
import { randomBytes } from 'crypto';
import { fileURLToPath } from 'url';
import { randomBytes, createHash } from 'crypto';
import { execSync } from 'child_process';
import { tmpdir } from 'os';
import qrcode from 'qrcode-terminal';
@ -45,9 +46,28 @@ const WHATSAPP_DEBUG =
const PORT = parseInt(getArg('port', '3000'), 10);
const SESSION_DIR = getArg('session', path.join(process.env.HOME || '~', '.hermes', 'whatsapp', 'session'));
const IMAGE_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'image_cache');
const DOCUMENT_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'document_cache');
const AUDIO_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'audio_cache');
// Cache directories: the Python gateway passes the profile-aware paths via
// env (HERMES_HOME-aware, new cache/ layout). Fall back to the legacy
// hardcoded locations for bridges launched outside the gateway.
const IMAGE_CACHE_DIR = process.env.HERMES_IMAGE_CACHE_DIR
|| path.join(process.env.HOME || '~', '.hermes', 'image_cache');
const DOCUMENT_CACHE_DIR = process.env.HERMES_DOCUMENT_CACHE_DIR
|| path.join(process.env.HOME || '~', '.hermes', 'document_cache');
const AUDIO_CACHE_DIR = process.env.HERMES_AUDIO_CACHE_DIR
|| path.join(process.env.HOME || '~', '.hermes', 'audio_cache');
// Self-hash of this script file. Reported in /health so the Python gateway
// can detect a running bridge that predates the current bridge.js and
// restart it instead of silently reusing stale code (stale-bridge trap:
// `hermes update` updates bridge.js on disk but a long-lived bridge process
// keeps serving the old behavior forever).
let SCRIPT_HASH = '';
try {
SCRIPT_HASH = createHash('sha256')
.update(readFileSync(fileURLToPath(import.meta.url)))
.digest('hex')
.slice(0, 16);
} catch {}
const PAIR_ONLY = args.includes('--pair-only');
const WHATSAPP_MODE = getArg('mode', process.env.WHATSAPP_MODE || 'self-chat'); // "bot" or "self-chat"
const ALLOWED_USERS = parseAllowedUsers(process.env.WHATSAPP_ALLOWED_USERS || '');
@ -700,6 +720,7 @@ app.get('/health', (req, res) => {
status: connectionState,
queueLength: messageQueue.length,
uptime: process.uptime(),
scriptHash: SCRIPT_HASH,
});
});

View file

@ -0,0 +1,341 @@
"""Tests for the WhatsApp stale-bridge staleness handshake.
Regression tests for the stale-bridge trap: ``connect()`` reused any
already-running bridge with ``status: connected`` unconditionally, and
``disconnect()`` only kills bridges the adapter spawned itself. A
long-lived bridge process therefore survived gateway restarts AND
``hermes update``, serving pre-update bridge.js behavior forever (e.g.
no inbound media download images/voice notes arrive as placeholders).
The fix: bridge.js reports a hash of its own source in ``/health``
(``scriptHash``); the adapter compares it against the bridge.js on disk
and restarts the bridge on mismatch. Bridges that predate the handshake
report no hash and are treated as stale by definition.
Also covers the npm dependency-refresh stamp: deps are reinstalled when
package.json changes, not only when node_modules is missing.
"""
import asyncio
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from gateway.config import Platform
class _AsyncCM:
"""Minimal async context manager returning a fixed value."""
def __init__(self, value):
self.value = value
async def __aenter__(self):
return self.value
async def __aexit__(self, *exc):
return False
def _make_adapter(bridge_script: str = "/tmp/test-bridge.js",
session_path: Path = Path("/tmp/test-wa-session")):
"""Create a WhatsAppAdapter with test attributes (bypass __init__)."""
from gateway.platforms.whatsapp import WhatsAppAdapter
adapter = WhatsAppAdapter.__new__(WhatsAppAdapter)
adapter.platform = Platform.WHATSAPP
adapter.config = MagicMock()
adapter._bridge_port = 19876
adapter._bridge_script = bridge_script
adapter._session_path = session_path
adapter._bridge_log_fh = None
adapter._bridge_log = None
adapter._bridge_process = None
adapter._reply_prefix = None
adapter._running = False
adapter._message_handler = None
adapter._fatal_error_code = None
adapter._fatal_error_message = None
adapter._fatal_error_retryable = True
adapter._fatal_error_handler = None
adapter._active_sessions = {}
adapter._pending_messages = {}
adapter._background_tasks = set()
adapter._auto_tts_disabled_chats = set()
adapter._message_queue = asyncio.Queue()
adapter._http_session = None
return adapter
def _mock_health(json_data):
"""Mock aiohttp.ClientSession whose GET returns 200 + *json_data*."""
mock_resp = MagicMock()
mock_resp.status = 200
mock_resp.json = AsyncMock(return_value=json_data)
mock_session = MagicMock()
mock_session.get = MagicMock(return_value=_AsyncCM(mock_resp))
mock_session.close = AsyncMock()
return MagicMock(return_value=_AsyncCM(mock_session))
def _setup_bridge_dir(tmp_path: Path) -> Path:
"""Create a real bridge dir with bridge.js + package.json + creds."""
bridge_dir = tmp_path / "whatsapp-bridge"
bridge_dir.mkdir()
(bridge_dir / "bridge.js").write_text("// current bridge code\n")
(bridge_dir / "package.json").write_text('{"name": "bridge"}\n')
session_path = tmp_path / "session"
session_path.mkdir()
(session_path / "creds.json").write_text("{}")
return bridge_dir
def _fresh_node_modules(bridge_dir: Path) -> None:
"""Create node_modules with a stamp matching the current package.json."""
from gateway.platforms.whatsapp import _file_content_hash
nm = bridge_dir / "node_modules"
nm.mkdir()
(nm / ".hermes-pkg-hash").write_text(
_file_content_hash(bridge_dir / "package.json")
)
class TestFileContentHash:
def test_hashes_file(self, tmp_path):
from gateway.platforms.whatsapp import _file_content_hash
f = tmp_path / "x.js"
f.write_text("abc")
h = _file_content_hash(f)
assert len(h) == 16
assert h == _file_content_hash(f) # deterministic
def test_changes_with_content(self, tmp_path):
from gateway.platforms.whatsapp import _file_content_hash
f = tmp_path / "x.js"
f.write_text("abc")
h1 = _file_content_hash(f)
f.write_text("def")
assert _file_content_hash(f) != h1
def test_missing_file_returns_empty(self, tmp_path):
from gateway.platforms.whatsapp import _file_content_hash
assert _file_content_hash(tmp_path / "nope.js") == ""
def test_matches_bridge_js_self_hash_algorithm(self, tmp_path):
"""Python and Node must compute the same hash for the same bytes."""
import hashlib
from gateway.platforms.whatsapp import _file_content_hash
f = tmp_path / "bridge.js"
f.write_bytes(b"const x = 1;\n")
# Node side: createHash('sha256').update(bytes).digest('hex').slice(0, 16)
expected = hashlib.sha256(b"const x = 1;\n").hexdigest()[:16]
assert _file_content_hash(f) == expected
class TestStaleBridgeHandshake:
@pytest.mark.asyncio
async def test_reuses_bridge_when_hash_matches(self, tmp_path):
from gateway.platforms.whatsapp import _file_content_hash
bridge_dir = _setup_bridge_dir(tmp_path)
_fresh_node_modules(bridge_dir)
adapter = _make_adapter(
bridge_script=str(bridge_dir / "bridge.js"),
session_path=tmp_path / "session",
)
disk_hash = _file_content_hash(bridge_dir / "bridge.js")
mock_client = _mock_health({"status": "connected", "scriptHash": disk_hash})
with patch("gateway.platforms.whatsapp.check_whatsapp_requirements", return_value=True), \
patch("aiohttp.ClientSession", mock_client), \
patch("gateway.platforms.whatsapp.asyncio.create_task") as mock_task, \
patch("subprocess.Popen") as mock_popen, \
patch.object(adapter, "_acquire_platform_lock", return_value=True, create=True), \
patch.object(adapter, "_mark_connected", create=True):
result = await adapter.connect()
assert result is True
mock_popen.assert_not_called() # reused, never spawned
mock_task.assert_called_once()
@pytest.mark.asyncio
async def test_restarts_bridge_on_hash_mismatch(self, tmp_path):
bridge_dir = _setup_bridge_dir(tmp_path)
_fresh_node_modules(bridge_dir)
adapter = _make_adapter(
bridge_script=str(bridge_dir / "bridge.js"),
session_path=tmp_path / "session",
)
mock_client = _mock_health(
{"status": "connected", "scriptHash": "deadbeefdeadbeef"}
)
# Spawned bridge dies immediately → connect() returns False, but the
# assertion that matters is that the stale bridge was NOT reused and
# a new process spawn was attempted.
mock_proc = MagicMock()
mock_proc.poll.return_value = 1
mock_proc.returncode = 1
with patch("gateway.platforms.whatsapp.check_whatsapp_requirements", return_value=True), \
patch("aiohttp.ClientSession", mock_client), \
patch("gateway.platforms.whatsapp.asyncio.sleep", new_callable=AsyncMock), \
patch("gateway.platforms.whatsapp._kill_stale_bridge_by_pidfile"), \
patch("gateway.platforms.whatsapp._kill_port_process") as mock_kill_port, \
patch("subprocess.Popen", return_value=mock_proc) as mock_popen, \
patch.object(adapter, "_acquire_platform_lock", return_value=True, create=True):
result = await adapter.connect()
assert result is False # mock proc died; not the point of the test
mock_popen.assert_called_once() # stale bridge replaced, not reused
mock_kill_port.assert_called_once_with(adapter._bridge_port)
@pytest.mark.asyncio
async def test_restarts_unversioned_bridge(self, tmp_path):
"""Bridges predating the handshake report no scriptHash → stale."""
bridge_dir = _setup_bridge_dir(tmp_path)
_fresh_node_modules(bridge_dir)
adapter = _make_adapter(
bridge_script=str(bridge_dir / "bridge.js"),
session_path=tmp_path / "session",
)
# Old bridge /health payload: no scriptHash key at all
mock_client = _mock_health({"status": "connected"})
mock_proc = MagicMock()
mock_proc.poll.return_value = 1
mock_proc.returncode = 1
with patch("gateway.platforms.whatsapp.check_whatsapp_requirements", return_value=True), \
patch("aiohttp.ClientSession", mock_client), \
patch("gateway.platforms.whatsapp.asyncio.sleep", new_callable=AsyncMock), \
patch("gateway.platforms.whatsapp._kill_stale_bridge_by_pidfile"), \
patch("gateway.platforms.whatsapp._kill_port_process"), \
patch("subprocess.Popen", return_value=mock_proc) as mock_popen, \
patch.object(adapter, "_acquire_platform_lock", return_value=True, create=True):
await adapter.connect()
mock_popen.assert_called_once()
class TestDepRefreshStamp:
@pytest.mark.asyncio
async def test_skips_install_when_stamp_fresh(self, tmp_path):
bridge_dir = _setup_bridge_dir(tmp_path)
_fresh_node_modules(bridge_dir)
adapter = _make_adapter(
bridge_script=str(bridge_dir / "bridge.js"),
session_path=tmp_path / "session",
)
mock_proc = MagicMock()
mock_proc.poll.return_value = 1
mock_proc.returncode = 1
with patch("gateway.platforms.whatsapp.check_whatsapp_requirements", return_value=True), \
patch("aiohttp.ClientSession", _mock_health({"status": "disconnected"})), \
patch("gateway.platforms.whatsapp.asyncio.sleep", new_callable=AsyncMock), \
patch("gateway.platforms.whatsapp._kill_stale_bridge_by_pidfile"), \
patch("gateway.platforms.whatsapp._kill_port_process"), \
patch("subprocess.run") as mock_run, \
patch("subprocess.Popen", return_value=mock_proc), \
patch.object(adapter, "_acquire_platform_lock", return_value=True, create=True):
await adapter.connect()
mock_run.assert_not_called()
@pytest.mark.asyncio
async def test_reinstalls_when_package_json_changed(self, tmp_path):
bridge_dir = _setup_bridge_dir(tmp_path)
_fresh_node_modules(bridge_dir)
# Simulate `hermes update` bumping the Baileys pin
(bridge_dir / "package.json").write_text('{"name": "bridge", "v": 2}\n')
adapter = _make_adapter(
bridge_script=str(bridge_dir / "bridge.js"),
session_path=tmp_path / "session",
)
mock_proc = MagicMock()
mock_proc.poll.return_value = 1
mock_proc.returncode = 1
with patch("gateway.platforms.whatsapp.check_whatsapp_requirements", return_value=True), \
patch("aiohttp.ClientSession", _mock_health({"status": "disconnected"})), \
patch("gateway.platforms.whatsapp.asyncio.sleep", new_callable=AsyncMock), \
patch("gateway.platforms.whatsapp._kill_stale_bridge_by_pidfile"), \
patch("gateway.platforms.whatsapp._kill_port_process"), \
patch("subprocess.run", return_value=MagicMock(returncode=0)) as mock_run, \
patch("subprocess.Popen", return_value=mock_proc), \
patch.object(adapter, "_acquire_platform_lock", return_value=True, create=True):
await adapter.connect()
mock_run.assert_called_once()
assert "install" in mock_run.call_args[0][0]
# Stamp updated to the new package.json hash
from gateway.platforms.whatsapp import _file_content_hash
stamp = (bridge_dir / "node_modules" / ".hermes-pkg-hash").read_text().strip()
assert stamp == _file_content_hash(bridge_dir / "package.json")
@pytest.mark.asyncio
async def test_installs_when_node_modules_missing(self, tmp_path):
bridge_dir = _setup_bridge_dir(tmp_path) # no node_modules
adapter = _make_adapter(
bridge_script=str(bridge_dir / "bridge.js"),
session_path=tmp_path / "session",
)
mock_proc = MagicMock()
mock_proc.poll.return_value = 1
mock_proc.returncode = 1
def _npm_install(*args, **kwargs):
# npm creates node_modules as a side effect
(bridge_dir / "node_modules").mkdir(exist_ok=True)
return MagicMock(returncode=0)
with patch("gateway.platforms.whatsapp.check_whatsapp_requirements", return_value=True), \
patch("aiohttp.ClientSession", _mock_health({"status": "disconnected"})), \
patch("gateway.platforms.whatsapp.asyncio.sleep", new_callable=AsyncMock), \
patch("gateway.platforms.whatsapp._kill_stale_bridge_by_pidfile"), \
patch("gateway.platforms.whatsapp._kill_port_process"), \
patch("subprocess.run", side_effect=_npm_install) as mock_run, \
patch("subprocess.Popen", return_value=mock_proc), \
patch.object(adapter, "_acquire_platform_lock", return_value=True, create=True):
await adapter.connect()
mock_run.assert_called_once()
class TestCacheDirEnvPassthrough:
@pytest.mark.asyncio
async def test_bridge_spawn_env_has_cache_dirs(self, tmp_path):
bridge_dir = _setup_bridge_dir(tmp_path)
_fresh_node_modules(bridge_dir)
adapter = _make_adapter(
bridge_script=str(bridge_dir / "bridge.js"),
session_path=tmp_path / "session",
)
mock_proc = MagicMock()
mock_proc.poll.return_value = 1
mock_proc.returncode = 1
with patch("gateway.platforms.whatsapp.check_whatsapp_requirements", return_value=True), \
patch("aiohttp.ClientSession", _mock_health({"status": "disconnected"})), \
patch("gateway.platforms.whatsapp.asyncio.sleep", new_callable=AsyncMock), \
patch("gateway.platforms.whatsapp._kill_stale_bridge_by_pidfile"), \
patch("gateway.platforms.whatsapp._kill_port_process"), \
patch("subprocess.Popen", return_value=mock_proc) as mock_popen, \
patch.object(adapter, "_acquire_platform_lock", return_value=True, create=True):
await adapter.connect()
env = mock_popen.call_args.kwargs["env"]
from gateway.platforms.base import (
get_audio_cache_dir,
get_document_cache_dir,
get_image_cache_dir,
)
assert env["HERMES_IMAGE_CACHE_DIR"] == str(get_image_cache_dir())
assert env["HERMES_AUDIO_CACHE_DIR"] == str(get_audio_cache_dir())
assert env["HERMES_DOCUMENT_CACHE_DIR"] == str(get_document_cache_dir())