mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(gateway): prevent --replace race condition causing multiple instances
When starting the gateway with --replace, concurrent invocations could leave multiple instances running simultaneously. This happened because write_pid_file() used a plain overwrite, so the second racer would silently replace the first process's PID record. Changes: - gateway/status.py: write_pid_file() now uses atomic O_CREAT|O_EXCL creation. If the file already exists, it raises FileExistsError, allowing exactly one process to win the race. - gateway/run.py: before writing the PID file, re-check get_running_pid() and catch FileExistsError from write_pid_file(). In both cases, stop the runner and return False so the process exits cleanly. Fixes #11718
This commit is contained in:
parent
328223576b
commit
cbe29db774
3 changed files with 42 additions and 4 deletions
|
|
@ -10956,8 +10956,25 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
|
||||
# Write PID file so CLI can detect gateway is running
|
||||
import atexit
|
||||
from gateway.status import write_pid_file, remove_pid_file
|
||||
write_pid_file()
|
||||
from gateway.status import write_pid_file, remove_pid_file, get_running_pid
|
||||
# Defensive re-check: another --replace racer may have started
|
||||
# while we were initializing. If so, yield and exit.
|
||||
_current_pid = get_running_pid()
|
||||
if _current_pid is not None and _current_pid != os.getpid():
|
||||
logger.error(
|
||||
"Another gateway instance (PID %d) started during our startup. "
|
||||
"Exiting to avoid double-running.", _current_pid
|
||||
)
|
||||
await runner.stop()
|
||||
return False
|
||||
try:
|
||||
write_pid_file()
|
||||
except FileExistsError:
|
||||
logger.error(
|
||||
"PID file race lost to another gateway instance. Exiting."
|
||||
)
|
||||
await runner.stop()
|
||||
return False
|
||||
atexit.register(remove_pid_file)
|
||||
|
||||
# Start background cron ticker so scheduled jobs fire automatically.
|
||||
|
|
|
|||
|
|
@ -225,8 +225,28 @@ def _cleanup_invalid_pid_path(pid_path: Path, *, cleanup_stale: bool) -> None:
|
|||
|
||||
|
||||
def write_pid_file() -> None:
|
||||
"""Write the current process PID and metadata to the gateway PID file."""
|
||||
_write_json_file(_get_pid_path(), _build_pid_record())
|
||||
"""Write the current process PID and metadata to the gateway PID file.
|
||||
|
||||
Uses atomic O_CREAT | O_EXCL creation so that concurrent --replace
|
||||
invocations race: exactly one process wins and the rest get
|
||||
FileExistsError.
|
||||
"""
|
||||
path = _get_pid_path()
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
record = json.dumps(_build_pid_record())
|
||||
try:
|
||||
fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
|
||||
except FileExistsError:
|
||||
raise # Let caller decide: another gateway is racing us
|
||||
try:
|
||||
with os.fdopen(fd, "w", encoding="utf-8") as f:
|
||||
f.write(record)
|
||||
except Exception:
|
||||
try:
|
||||
path.unlink(missing_ok=True)
|
||||
except OSError:
|
||||
pass
|
||||
raise
|
||||
|
||||
|
||||
def write_runtime_status(
|
||||
|
|
|
|||
|
|
@ -307,6 +307,7 @@ AUTHOR_MAP = {
|
|||
"anthhub@163.com": "anthhub",
|
||||
"shenuu@gmail.com": "shenuu",
|
||||
"xiayh17@gmail.com": "xiayh0107",
|
||||
"zhujianxyz@gmail.com": "opriz",
|
||||
"asurla@nvidia.com": "anniesurla",
|
||||
"limkuan24@gmail.com": "WideLee",
|
||||
"aviralarora002@gmail.com": "AviArora02-commits",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue