mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(photon): recover degraded upstream stream
This commit is contained in:
parent
34bd6a0db5
commit
06cbc3bae9
6 changed files with 241 additions and 4 deletions
|
|
@ -91,7 +91,11 @@ _SIDECAR_DIR = Path(__file__).parent / "sidecar"
|
|||
_PHOTON_RETRYABLE_PATTERNS = (
|
||||
"internal sidecar error",
|
||||
"upstream connect error",
|
||||
"upstream unavailable",
|
||||
"connection dropped",
|
||||
"reset reason: overflow",
|
||||
"upstream_overflow",
|
||||
"upstream_unavailable",
|
||||
)
|
||||
|
||||
# Minimum seconds between typing-indicator calls for the same chat.
|
||||
|
|
@ -235,8 +239,10 @@ class PhotonAdapter(BasePlatformAdapter):
|
|||
self._sidecar_proc: Optional[subprocess.Popen] = None
|
||||
self._sidecar_supervisor_task: Optional[asyncio.Task] = None
|
||||
self._inbound_task: Optional[asyncio.Task] = None
|
||||
self._sidecar_health_task: Optional[asyncio.Task] = None
|
||||
self._inbound_running = False
|
||||
self._http_client: Optional["httpx.AsyncClient"] = None
|
||||
self._sidecar_health_interval = 15.0
|
||||
# Lightweight in-memory dedup. The gRPC stream is at-least-once, so we
|
||||
# may see the same messageId more than once (e.g. after a reconnect).
|
||||
self._seen_messages: Dict[str, float] = {}
|
||||
|
|
@ -370,6 +376,9 @@ class PhotonAdapter(BasePlatformAdapter):
|
|||
self._inbound_task = asyncio.get_event_loop().create_task(
|
||||
self._inbound_loop()
|
||||
)
|
||||
self._sidecar_health_task = asyncio.get_event_loop().create_task(
|
||||
self._monitor_sidecar_health()
|
||||
)
|
||||
|
||||
self._mark_connected()
|
||||
logger.info(
|
||||
|
|
@ -380,6 +389,17 @@ class PhotonAdapter(BasePlatformAdapter):
|
|||
|
||||
async def disconnect(self) -> None:
|
||||
self._inbound_running = False
|
||||
if self._sidecar_health_task is not None:
|
||||
task = self._sidecar_health_task
|
||||
self._sidecar_health_task = None
|
||||
task.cancel()
|
||||
if task is not asyncio.current_task():
|
||||
try:
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
if self._inbound_task is not None:
|
||||
self._inbound_task.cancel()
|
||||
try:
|
||||
|
|
@ -440,6 +460,49 @@ class PhotonAdapter(BasePlatformAdapter):
|
|||
await asyncio.sleep(backoff)
|
||||
backoff = min(backoff * 2, 30.0)
|
||||
|
||||
async def _monitor_sidecar_health(self) -> None:
|
||||
"""Promote degraded upstream Photon stream health into reconnect.
|
||||
|
||||
The sidecar HTTP process can stay alive while spectrum-ts repeatedly
|
||||
fails to maintain the upstream inbound gRPC stream. Polling `/healthz`
|
||||
keeps that from becoming a silent inbound outage.
|
||||
"""
|
||||
while self._inbound_running:
|
||||
await asyncio.sleep(self._sidecar_health_interval)
|
||||
if not self._inbound_running:
|
||||
break
|
||||
try:
|
||||
data = await self._sidecar_call("/healthz", {})
|
||||
except asyncio.CancelledError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.debug("[photon] sidecar health check failed: %s", exc)
|
||||
continue
|
||||
|
||||
stream = data.get("stream") if isinstance(data, dict) else None
|
||||
if not isinstance(stream, dict) or stream.get("ok") is not False:
|
||||
continue
|
||||
|
||||
state = str(stream.get("state") or "unknown")
|
||||
degraded_for_ms = stream.get("degradedForMs")
|
||||
last_issue = str(stream.get("lastIssue") or "unknown stream issue")
|
||||
message = (
|
||||
"Photon upstream stream degraded"
|
||||
f" (state={state}, degradedForMs={degraded_for_ms}): "
|
||||
f"{last_issue}"
|
||||
)
|
||||
logger.error("[photon] %s", message)
|
||||
self._set_fatal_error(
|
||||
"UPSTREAM_STREAM_DEGRADED",
|
||||
message,
|
||||
retryable=True,
|
||||
)
|
||||
try:
|
||||
await self._notify_fatal_error()
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
logger.warning("[photon] fatal-error notification failed: %s", exc)
|
||||
break
|
||||
|
||||
async def _on_inbound_line(self, line: str) -> None:
|
||||
try:
|
||||
event = json.loads(line)
|
||||
|
|
|
|||
|
|
@ -80,6 +80,109 @@ const E164_RE = /^\+\d{6,}$/;
|
|||
const MAX_KNOWN_SPACES = 2048;
|
||||
const MAX_KNOWN_MESSAGES = 1024;
|
||||
const MAX_REACTION_HANDLES = 512;
|
||||
const STREAM_DEGRADED_RESTART_MS =
|
||||
Number(process.env.PHOTON_STREAM_DEGRADED_RESTART_MS) || 90 * 1000;
|
||||
const STREAM_INTERRUPTED_DEGRADE_COUNT =
|
||||
Number(process.env.PHOTON_STREAM_INTERRUPTED_DEGRADE_COUNT) || 3;
|
||||
|
||||
const streamHealth = {
|
||||
state: "starting",
|
||||
degradedSince: null,
|
||||
lastHealthyAt: null,
|
||||
lastIssueAt: null,
|
||||
lastIssue: null,
|
||||
issueCount: 0,
|
||||
};
|
||||
let streamRestartTimer = null;
|
||||
|
||||
function streamHealthSnapshot() {
|
||||
const now = Date.now();
|
||||
const degradedForMs =
|
||||
streamHealth.degradedSince === null ? 0 : now - streamHealth.degradedSince;
|
||||
return {
|
||||
ok: streamHealth.state !== "degraded",
|
||||
state: streamHealth.state,
|
||||
degradedForMs,
|
||||
restartAfterMs: STREAM_DEGRADED_RESTART_MS,
|
||||
lastHealthyAt: streamHealth.lastHealthyAt,
|
||||
lastIssueAt: streamHealth.lastIssueAt,
|
||||
lastIssue: streamHealth.lastIssue,
|
||||
issueCount: streamHealth.issueCount,
|
||||
};
|
||||
}
|
||||
|
||||
function markStreamHealthy() {
|
||||
streamHealth.state = "healthy";
|
||||
streamHealth.degradedSince = null;
|
||||
streamHealth.lastHealthyAt = new Date().toISOString();
|
||||
streamHealth.issueCount = 0;
|
||||
if (streamRestartTimer) {
|
||||
clearTimeout(streamRestartTimer);
|
||||
streamRestartTimer = null;
|
||||
}
|
||||
}
|
||||
|
||||
function scheduleStreamRestart() {
|
||||
if (STREAM_DEGRADED_RESTART_MS <= 0 || streamRestartTimer) return;
|
||||
streamRestartTimer = setTimeout(() => {
|
||||
streamRestartTimer = null;
|
||||
if (streamHealth.state !== "degraded" || streamHealth.degradedSince === null) {
|
||||
return;
|
||||
}
|
||||
const degradedForMs = Date.now() - streamHealth.degradedSince;
|
||||
if (degradedForMs < STREAM_DEGRADED_RESTART_MS) {
|
||||
scheduleStreamRestart();
|
||||
return;
|
||||
}
|
||||
console.error(
|
||||
`photon-sidecar: upstream stream degraded for ${degradedForMs}ms; ` +
|
||||
"exiting so Hermes can restart the Photon adapter"
|
||||
);
|
||||
process.exit(75);
|
||||
}, STREAM_DEGRADED_RESTART_MS + 1000);
|
||||
streamRestartTimer.unref();
|
||||
}
|
||||
|
||||
function markStreamDegraded(reason) {
|
||||
const now = Date.now();
|
||||
if (streamHealth.state !== "degraded") {
|
||||
streamHealth.degradedSince = now;
|
||||
}
|
||||
streamHealth.state = "degraded";
|
||||
streamHealth.lastIssueAt = new Date(now).toISOString();
|
||||
streamHealth.lastIssue = reason;
|
||||
streamHealth.issueCount += 1;
|
||||
scheduleStreamRestart();
|
||||
}
|
||||
|
||||
function markStreamRecovering(reason) {
|
||||
if (streamHealth.state !== "recovering") {
|
||||
streamHealth.issueCount = 0;
|
||||
}
|
||||
streamHealth.state = "recovering";
|
||||
streamHealth.lastIssueAt = new Date().toISOString();
|
||||
streamHealth.lastIssue = reason;
|
||||
streamHealth.issueCount += 1;
|
||||
if (streamHealth.issueCount >= STREAM_INTERRUPTED_DEGRADE_COUNT) {
|
||||
markStreamDegraded(reason);
|
||||
}
|
||||
}
|
||||
|
||||
const originalConsoleError = console.error.bind(console);
|
||||
console.error = (...args) => {
|
||||
const text = args
|
||||
.map((arg) => (arg && arg.stack ? arg.stack : String(arg)))
|
||||
.join(" ");
|
||||
if (text.includes("[spectrum.stream]")) {
|
||||
const reason = text.split("\n", 1)[0];
|
||||
if (text.includes("persistently failing")) {
|
||||
markStreamDegraded(reason);
|
||||
} else if (text.includes("stream interrupted")) {
|
||||
markStreamRecovering(reason);
|
||||
}
|
||||
}
|
||||
originalConsoleError(...args);
|
||||
};
|
||||
|
||||
if (!projectId || !projectSecret || !sharedToken) {
|
||||
console.error(
|
||||
|
|
@ -353,6 +456,7 @@ async function normalizeEvent(space, message) {
|
|||
try {
|
||||
for await (const [space, message] of app.messages) {
|
||||
backoff = 1000; // healthy traffic — reset
|
||||
markStreamHealthy();
|
||||
// Only forward inbound messages (ignore our own outbound echoes).
|
||||
if (message && message.direction && message.direction !== "inbound") {
|
||||
continue;
|
||||
|
|
@ -364,11 +468,14 @@ async function normalizeEvent(space, message) {
|
|||
await deliver(JSON.stringify(event));
|
||||
}
|
||||
console.error("photon-sidecar: inbound stream ended — re-subscribing");
|
||||
markStreamRecovering("inbound stream ended");
|
||||
} catch (e) {
|
||||
const reason = e && e.message ? e.message : String(e);
|
||||
console.error(
|
||||
"photon-sidecar: inbound stream errored — restarting: " +
|
||||
(e && e.message ? e.message : String(e))
|
||||
reason
|
||||
);
|
||||
markStreamRecovering(reason);
|
||||
}
|
||||
await new Promise((r) =>
|
||||
setTimeout(r, backoff + Math.random() * backoff * 0.2)
|
||||
|
|
@ -530,7 +637,7 @@ const server = http.createServer(async (req, res) => {
|
|||
}
|
||||
try {
|
||||
if (req.url === "/healthz") {
|
||||
return ok(res, {});
|
||||
return ok(res, { stream: streamHealthSnapshot() });
|
||||
}
|
||||
if (req.url === "/shutdown") {
|
||||
ok(res, {});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue