From cbe29db774ac933f0c2fe07d500ad5f73316b7f9 Mon Sep 17 00:00:00 2001
From: opriz
Date: Sat, 18 Apr 2026 02:24:35 +0800
Subject: [PATCH] fix(gateway): prevent --replace race condition causing multiple instances

When starting the gateway with --replace, concurrent invocations could
leave multiple instances running simultaneously. This happened because
write_pid_file() used a plain overwrite, so the second racer would
silently replace the first process's PID record.

Changes:
- gateway/status.py: write_pid_file() now uses atomic O_CREAT|O_EXCL
  creation. If the file already exists, it raises FileExistsError,
  allowing exactly one process to win the race.
- gateway/run.py: before writing the PID file, re-check
  get_running_pid() and catch FileExistsError from write_pid_file().
  In both cases, stop the runner and return False so the process
  exits cleanly.

Fixes #11718
---
 gateway/run.py     | 21 +++++++++++++++++++--
 gateway/status.py  | 24 ++++++++++++++++++++++--
 scripts/release.py |  1 +
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/gateway/run.py b/gateway/run.py
index 6ce409ff1..d3ee8d4a0 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -10956,8 +10956,25 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
 
     # Write PID file so CLI can detect gateway is running
     import atexit
-    from gateway.status import write_pid_file, remove_pid_file
-    write_pid_file()
+    from gateway.status import write_pid_file, remove_pid_file, get_running_pid
+    # Defensive re-check: another --replace racer may have started
+    # while we were initializing. If so, yield and exit.
+    _current_pid = get_running_pid()
+    if _current_pid is not None and _current_pid != os.getpid():
+        logger.error(
+            "Another gateway instance (PID %d) started during our startup. "
+            "Exiting to avoid double-running.", _current_pid
+        )
+        await runner.stop()
+        return False
+    try:
+        write_pid_file()
+    except FileExistsError:
+        logger.error(
+            "PID file race lost to another gateway instance. Exiting."
+        )
+        await runner.stop()
+        return False
     atexit.register(remove_pid_file)
 
     # Start background cron ticker so scheduled jobs fire automatically.
diff --git a/gateway/status.py b/gateway/status.py
index e1598e179..74763332c 100644
--- a/gateway/status.py
+++ b/gateway/status.py
@@ -225,8 +225,28 @@ def _cleanup_invalid_pid_path(pid_path: Path, *, cleanup_stale: bool) -> None:
 
 
 def write_pid_file() -> None:
-    """Write the current process PID and metadata to the gateway PID file."""
-    _write_json_file(_get_pid_path(), _build_pid_record())
+    """Write the current process PID and metadata to the gateway PID file.
+
+    Uses atomic O_CREAT | O_EXCL creation so that concurrent --replace
+    invocations race: exactly one process wins and the rest get
+    FileExistsError.
+    """
+    path = _get_pid_path()
+    path.parent.mkdir(parents=True, exist_ok=True)
+    record = json.dumps(_build_pid_record())
+    try:
+        fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+    except FileExistsError:
+        raise  # Let caller decide: another gateway is racing us
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as f:
+            f.write(record)
+    except Exception:
+        try:
+            path.unlink(missing_ok=True)
+        except OSError:
+            pass
+        raise
 
 
 def write_runtime_status(
diff --git a/scripts/release.py b/scripts/release.py
index 1a5a1ea8a..efe32f236 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -307,6 +307,7 @@ AUTHOR_MAP = {
     "anthhub@163.com": "anthhub",
     "shenuu@gmail.com": "shenuu",
     "xiayh17@gmail.com": "xiayh0107",
+    "zhujianxyz@gmail.com": "opriz",
     "asurla@nvidia.com": "anniesurla",
     "limkuan24@gmail.com": "WideLee",
     "aviralarora002@gmail.com": "AviArora02-commits",