diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml index 9cb3171aec6..b6590f0a010 100644 --- a/.github/workflows/nix.yml +++ b/.github/workflows/nix.yml @@ -37,23 +37,16 @@ jobs: - name: Check flake id: flake - if: runner.os == 'Linux' continue-on-error: true run: nix flake check --print-build-logs - - name: Build package - id: build - if: runner.os == 'Linux' - continue-on-error: true - run: nix build --print-build-logs - - # When the real Nix build fails, run a targeted diagnostic to see if + # When the flake check fails, run a targeted diagnostic to see if # the failure is specifically a stale npm lockfile hash in one of the # known npm subpackages (tui / web). This avoids surfacing a generic # "build failed" message when the fix is a single known command. - name: Diagnose npm lockfile hashes id: hash_check - if: (steps.flake.outcome == 'failure' || steps.build.outcome == 'failure') && runner.os == 'Linux' + if: steps.flake.outcome == 'failure' && runner.os == 'Linux' continue-on-error: true env: LINK_SHA: ${{ steps.sha.outputs.full }} @@ -88,30 +81,25 @@ jobs: - Or [run the Nix Lockfile Fix workflow](${{ github.server_url }}/${{ github.repository }}/actions/workflows/nix-lockfile-fix.yml) manually (pass PR `#${{ github.event.pull_request.number }}`) - Or locally: `nix run .#fix-lockfiles` and commit the diff - # Clear the sticky comment when either the build passed outright (no + # Clear the sticky comment when either the flake check passed outright (no # hash check needed) or the hash check explicitly returned stale=false - # (build failed for a non-hash reason). + # (check failed for a non-hash reason). - name: Clear sticky PR comment (resolved) if: | github.event_name == 'pull_request' && - runner.os == 'Linux' && (steps.hash_check.outputs.stale == 'false' || - (steps.flake.outcome == 'success' && steps.build.outcome == 'success')) + steps.flake.outcome == 'success') uses: marocchino/sticky-pull-request-comment@52423e01640425a022ef5fd42c6fb5f633a02728 # v2.9.1 with: header: nix-lockfile-check delete: true - - name: Final fail if build or flake failed - if: steps.flake.outcome == 'failure' || steps.build.outcome == 'failure' + - name: Final fail if flake check failed + if: steps.flake.outcome == 'failure' run: | if [ "${{ steps.hash_check.outputs.stale }}" == "true" ]; then echo "::error::Nix build failed due to stale npm lockfile hash. Run: nix run .#fix-lockfiles" else - echo "::error::Nix build/flake check failed. See logs above." + echo "::error::Nix flake check failed. See logs above." fi exit 1 - - - name: Evaluate flake (macOS) - if: runner.os == 'macOS' - run: nix flake show --json > /dev/null diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index baf1dbcc386..7842465a525 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -9,6 +9,8 @@ Usage: python -m hermes_cli.main web --port 8080 """ +from contextlib import asynccontextmanager + import asyncio import base64 import binascii @@ -84,7 +86,43 @@ except ImportError: WEB_DIST = Path(os.environ["HERMES_WEB_DIST"]) if "HERMES_WEB_DIST" in os.environ else Path(__file__).parent / "web_dist" _log = logging.getLogger(__name__) -app = FastAPI(title="Hermes Agent", version=__version__) +# --------------------------------------------------------------------------- +# Per-channel subscriber registry used by /api/pub (PTY-side gateway → dashboard) +# and /api/events (dashboard → browser sidebar). Keyed by an opaque channel id +# the chat tab generates on mount; entries auto-evict when the last subscriber +# drops AND the publisher has disconnected. +# +# State lives on app.state (not module-level globals) so that asyncio.Lock is +# created on the running event loop during lifespan startup. A module-level +# asyncio.Lock() binds to whatever loop was active at import time, which breaks +# when the same module is used across TestClient instances or uvicorn reloads. +# --------------------------------------------------------------------------- + +@asynccontextmanager +async def _lifespan(app: "FastAPI"): + app.state.event_channels = {} # dict[str, set] + app.state.event_lock = asyncio.Lock() + yield + + +def _get_event_state(app: "FastAPI"): + """Return (event_channels, event_lock) from app.state. + + Lazily initialises the state if the lifespan hasn't run (e.g. when + TestClient is constructed without a ``with`` block). The lifespan + path is preferred because it guarantees the Lock is created on the + correct event loop, but the lazy path lets existing non-``with`` + TestClient usages keep working. + """ + try: + return app.state.event_channels, app.state.event_lock + except AttributeError: + app.state.event_channels = {} + app.state.event_lock = asyncio.Lock() + return app.state.event_channels, app.state.event_lock + + +app = FastAPI(title="Hermes Agent", version=__version__, lifespan=_lifespan) # --------------------------------------------------------------------------- # Session token for protecting sensitive endpoints (reveal). @@ -6631,8 +6669,7 @@ def _ws_auth_ok(ws: "WebSocket") -> bool: # and /api/events (dashboard → browser sidebar). Keyed by an opaque channel id # the chat tab generates on mount; entries auto-evict when the last subscriber # drops AND the publisher has disconnected. -_event_channels: dict[str, set] = {} -_event_lock = asyncio.Lock() +# (State is initialised in _lifespan on app startup — see above.) def _resolve_chat_argv( @@ -6741,10 +6778,11 @@ def _build_sidecar_url(channel: str) -> Optional[str]: return f"ws://{netloc}/api/pub?{qs}" -async def _broadcast_event(channel: str, payload: str) -> None: +async def _broadcast_event(app: Any, channel: str, payload: str) -> None: """Fan out one publisher frame to every subscriber on `channel`.""" - async with _event_lock: - subs = list(_event_channels.get(channel, ())) + event_channels, event_lock = _get_event_state(app) + async with event_lock: + subs = list(event_channels.get(channel, ())) for sub in subs: try: @@ -6935,7 +6973,7 @@ async def pub_ws(ws: WebSocket) -> None: try: while True: - await _broadcast_event(channel, await ws.receive_text()) + await _broadcast_event(ws.app, channel, await ws.receive_text()) except WebSocketDisconnect: pass @@ -6961,8 +6999,9 @@ async def events_ws(ws: WebSocket) -> None: await ws.accept() - async with _event_lock: - _event_channels.setdefault(channel, set()).add(ws) + event_channels, event_lock = _get_event_state(ws.app) + async with event_lock: + event_channels.setdefault(channel, set()).add(ws) try: while True: @@ -6973,14 +7012,14 @@ async def events_ws(ws: WebSocket) -> None: except WebSocketDisconnect: pass finally: - async with _event_lock: - subs = _event_channels.get(channel) + async with event_lock: + subs = event_channels.get(channel) if subs is not None: subs.discard(ws) if not subs: - _event_channels.pop(channel, None) + event_channels.pop(channel, None) def _normalise_prefix(raw: Optional[str]) -> str: diff --git a/nix/checks.nix b/nix/checks.nix index e847ef26cbd..63ec1eb672b 100644 --- a/nix/checks.nix +++ b/nix/checks.nix @@ -58,6 +58,22 @@ json.dump(sorted(leaf_paths(DEFAULT_CONFIG)), sys.stdout, indent=2) echo "ok" > $out/result '' ); + + # Verify the default package builds successfully (cross-platform). + # On Linux the runtime checks below already depend on the package, + # but this ensures darwin builders also build it during flake check. + build-package = pkgs.runCommand "hermes-build-package" { } '' + echo "PASS: package built at ${hermes-agent}" + mkdir -p $out + echo "ok" > $out/result + ''; + + # Verify the devShell builds successfully (cross-platform). + build-devshell = pkgs.runCommand "hermes-build-devshell" { } '' + echo "PASS: devShell built at ${self'.devShells.default}" + mkdir -p $out + echo "ok" > $out/result + ''; } // lib.optionalAttrs pkgs.stdenv.hostPlatform.isLinux { # Verify binaries exist and are executable package-contents = pkgs.runCommand "hermes-package-contents" { } '' diff --git a/tests/hermes_cli/test_web_server.py b/tests/hermes_cli/test_web_server.py index 06079aed371..323f06c5a31 100644 --- a/tests/hermes_cli/test_web_server.py +++ b/tests/hermes_cli/test_web_server.py @@ -3415,7 +3415,7 @@ class TestPtyWebSocket: # subscriber registration and the message is dropped. deadline = time.monotonic() + 5.0 while time.monotonic() < deadline: - if ws_mod._event_channels.get("broadcast-test"): + if ws_mod.app.state.event_channels.get("broadcast-test"): break time.sleep(0.01) else: