fix(gateway): prevent --replace race condition causing multiple instances

When starting the gateway with --replace, concurrent invocations could
leave multiple instances running simultaneously. This happened because
write_pid_file() used a plain overwrite, so the second racer would
silently replace the first process's PID record.

Changes:
- gateway/status.py: write_pid_file() now uses atomic O_CREAT|O_EXCL
  creation. If the file already exists, it raises FileExistsError,
  allowing exactly one process to win the race.
- gateway/run.py: before writing the PID file, re-check get_running_pid()
  and catch FileExistsError from write_pid_file(). In both cases, stop
  the runner and return False so the process exits cleanly.

Fixes #11718
This commit is contained in:
opriz 2026-04-18 02:24:35 +08:00 committed by Teknium
parent 328223576b
commit cbe29db774
3 changed files with 42 additions and 4 deletions

View file

@ -10956,8 +10956,25 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
# Write PID file so CLI can detect gateway is running
import atexit
from gateway.status import write_pid_file, remove_pid_file
write_pid_file()
from gateway.status import write_pid_file, remove_pid_file, get_running_pid
# Defensive re-check: another --replace racer may have started
# while we were initializing. If so, yield and exit.
_current_pid = get_running_pid()
if _current_pid is not None and _current_pid != os.getpid():
logger.error(
"Another gateway instance (PID %d) started during our startup. "
"Exiting to avoid double-running.", _current_pid
)
await runner.stop()
return False
try:
write_pid_file()
except FileExistsError:
logger.error(
"PID file race lost to another gateway instance. Exiting."
)
await runner.stop()
return False
atexit.register(remove_pid_file)
# Start background cron ticker so scheduled jobs fire automatically.

View file

@ -225,8 +225,28 @@ def _cleanup_invalid_pid_path(pid_path: Path, *, cleanup_stale: bool) -> None:
def write_pid_file() -> None:
"""Write the current process PID and metadata to the gateway PID file."""
_write_json_file(_get_pid_path(), _build_pid_record())
"""Write the current process PID and metadata to the gateway PID file.
Uses atomic O_CREAT | O_EXCL creation so that concurrent --replace
invocations race: exactly one process wins and the rest get
FileExistsError.
"""
path = _get_pid_path()
path.parent.mkdir(parents=True, exist_ok=True)
record = json.dumps(_build_pid_record())
try:
fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
except FileExistsError:
raise # Let caller decide: another gateway is racing us
try:
with os.fdopen(fd, "w", encoding="utf-8") as f:
f.write(record)
except Exception:
try:
path.unlink(missing_ok=True)
except OSError:
pass
raise
def write_runtime_status(

View file

@ -307,6 +307,7 @@ AUTHOR_MAP = {
"anthhub@163.com": "anthhub",
"shenuu@gmail.com": "shenuu",
"xiayh17@gmail.com": "xiayh0107",
"zhujianxyz@gmail.com": "opriz",
"asurla@nvidia.com": "anniesurla",
"limkuan24@gmail.com": "WideLee",
"aviralarora002@gmail.com": "AviArora02-commits",