diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index e65965869d7..c0e69bcf3d1 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -80,6 +80,56 @@ jobs: with: image: ${{ env.IMAGE_NAME }}:test + # --------------------------------------------------------------------- + # Run the docker-integration test suite against the freshly-built + # image already loaded into the local daemon (`:test`). These tests + # are excluded from the sharded `tests.yml :: test` matrix on purpose + # (see `_SKIP_PARTS` in scripts/run_tests_parallel.py) because each + # shard would otherwise reach the session-scoped ``built_image`` + # fixture in ``tests/docker/conftest.py`` and start a 3-7min + # ``docker build`` under a 180s pytest-timeout cap — guaranteed to + # die in fixture setup. + # + # Piggybacking here avoids a second image build: the smoke test + # already proved the image loads + runs, so the daemon has it under + # `${IMAGE_NAME}:test` and we just point ``HERMES_TEST_IMAGE`` at + # that. The fixture's ``HERMES_TEST_IMAGE`` branch (see + # tests/docker/conftest.py:62-63) short-circuits the rebuild. + # + # Why this job and not a standalone one: the image is 5GB+; passing + # it between jobs via ``docker save``/``upload-artifact`` is slower + # than the build itself. Reusing the existing daemon state is the + # cheapest path to coverage on every PR that touches docker code. + # --------------------------------------------------------------------- + - name: Install uv (for docker tests) + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5 + + - name: Set up Python 3.11 (for docker tests) + run: uv python install 3.11 + + - name: Install Python dependencies (for docker tests) + run: | + uv venv .venv --python 3.11 + source .venv/bin/activate + # ``dev`` extra pulls in pytest, pytest-asyncio, pytest-timeout — + # everything tests/docker/ needs. We deliberately avoid ``all`` + # here because the docker tests only drive the container via + # subprocess and don't import hermes_agent's optional deps. + uv pip install -e ".[dev]" + + - name: Run docker integration tests + env: + # Skip rebuild; use the image already loaded by the build step. + HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test + # Match the policy in tests.yml :: test job — no accidental + # real-API calls from inside the harness. + OPENROUTER_API_KEY: "" + OPENAI_API_KEY: "" + NOUS_API_KEY: "" + run: | + source .venv/bin/activate + python -m pytest tests/docker/ -v --tb=short + - name: Log in to Docker Hub if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release' uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 diff --git a/scripts/run_tests_parallel.py b/scripts/run_tests_parallel.py index 57178899012..7fe0b57947a 100755 --- a/scripts/run_tests_parallel.py +++ b/scripts/run_tests_parallel.py @@ -52,10 +52,24 @@ from typing import Dict, List, Tuple # Default test discovery roots. _DEFAULT_ROOTS = ["tests"] -# Directories to skip during discovery — the e2e + integration suites -# require real services and are run separately. Match exactly the -# ``--ignore=`` flags the previous CI command used. -_SKIP_PARTS = {"integration", "e2e"} +# Directories to skip during discovery — these suites require real +# external services (a model gateway, a docker daemon with a prebuilt +# image, etc.) and are run in their own dedicated CI jobs: +# +# tests/e2e/ — .github/workflows/tests.yml :: e2e job +# tests/integration/ — historical; legacy --ignore flags +# tests/docker/ — .github/workflows/docker-publish.yml :: +# build-amd64 job (runs against the freshly-loaded +# nousresearch/hermes-agent:test image, via +# ``HERMES_TEST_IMAGE`` so the fixture skips +# rebuild). The full pytest-shard runner can't +# host these because the session-scoped +# ``built_image`` fixture would do a 3-7min +# ``docker build`` inside a 180s per-test +# pytest-timeout cap (set by tests/docker/conftest.py), +# so the build is guaranteed to die in fixture +# setup. The dedicated job sidesteps both costs. +_SKIP_PARTS = {"integration", "e2e", "docker"} # Per-file wall-clock cap. Generous default — pytest-timeout still # enforces per-test caps inside each subprocess; this is just an outer @@ -136,7 +150,10 @@ def _discover_files(roots: List[Path]) -> List[Path]: Exclude any file whose path contains a component in ``_SKIP_PARTS``, UNLESS the user explicitly named it as a root (in which case the - user's intent overrides the skip filter). + user's intent overrides the skip filter). This makes + ``scripts/run_tests.sh tests/docker/`` work locally the same way + ``pytest tests/docker/`` does — the CI-level skip exists to keep + the sharded matrix from blowing up, not to block targeted runs. """ seen: set[Path] = set() out: List[Path] = [] @@ -151,8 +168,17 @@ def _discover_files(roots: List[Path]) -> List[Path]: seen.add(real) out.append(root) continue + # If the explicit root itself sits inside a skipped dir (e.g. + # the user said ``tests/docker``), the user has overridden the + # skip for that subtree. Compute the set of skip-parts the user + # opted into, and only filter files whose path crosses a + # skip-part *outside* that opt-in. + root_skip_overrides = { + part for part in root.parts if part in _SKIP_PARTS + } + effective_skips = _SKIP_PARTS - root_skip_overrides for path in root.rglob("test_*.py"): - if any(part in _SKIP_PARTS for part in path.parts): + if any(part in effective_skips for part in path.parts): continue real = path.resolve() if real in seen: