fix(transcription): fall back to CPU when CUDA runtime libs are missing

faster-whisper's device="auto" picks CUDA when ctranslate2's wheel
ships CUDA shared libs, even on hosts without the NVIDIA runtime
(libcublas.so.12 / libcudnn*). On those hosts the model often loads
fine but transcribe() fails at first dlopen, and the broken model
stays cached in the module-global — every subsequent voice message
in the gateway process fails identically until restart.

- Add _load_local_whisper_model() wrapper: try auto, catch missing-lib
  errors, retry on device=cpu compute_type=int8.
- Wrap transcribe() with the same fallback: evict cached model, reload
  on CPU, retry once. Required because the dlopen failure only surfaces
  at first kernel launch, not at model construction.
- Narrow marker list (libcublas, libcudnn, libcudart, 'cannot be loaded',
  'no kernel image is available', 'no CUDA-capable device', driver
  mismatch). Deliberately excludes 'CUDA out of memory' and similar —
  those are real runtime failures that should surface, not be silently
  retried on CPU.
- Tests for load-time fallback, runtime fallback (with cached-model
  eviction verified), and the OOM non-fallback path.

Reported via Telegram voice-message dumps on WSL2 hosts where libcublas
isn't installed by default.
This commit is contained in:
Teknium 2026-04-24 02:49:18 -07:00 committed by Teknium
parent 34c3e67109
commit 4350668ae4
2 changed files with 179 additions and 4 deletions

View file

@@ -505,6 +505,101 @@ class TestTranscribeLocalExtended:
assert result["success"] is True assert result["success"] is True
assert result["transcript"] == "Hello world" assert result["transcript"] == "Hello world"
def test_load_time_cuda_lib_failure_falls_back_to_cpu(self, tmp_path):
    """Missing libcublas at load time → reload on CPU, succeed."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    # Successful CPU model: one segment, plus stream metadata.
    segment = MagicMock()
    segment.text = "hi"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0
    fallback_model = MagicMock()
    fallback_model.transcribe.return_value = ([segment], meta)

    load_attempts = []

    def fake_whisper(model_name, device, compute_type):
        # Record every (device, compute_type) the code tries, in order.
        load_attempts.append((device, compute_type))
        if device == "auto":
            # Simulate ctranslate2's dlopen failure on the GPU path.
            raise RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
        return fallback_model

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
            patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
            patch("tools.transcription_tools._local_model", None), \
            patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    assert result["success"] is True
    assert result["transcript"] == "hi"
    assert load_attempts == [("auto", "auto"), ("cpu", "int8")]
def test_runtime_cuda_lib_failure_evicts_cache_and_retries_on_cpu(self, tmp_path):
    """libcublas dlopen fails at transcribe() → evict cache, reload CPU, retry."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    segment = MagicMock()
    segment.text = "recovered"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0

    # Model #1 (device=auto) constructs fine, but the CUDA libs only fail
    # at first use — transcribe() raises the dlopen error.
    broken_gpu_model = MagicMock()
    broken_gpu_model.transcribe.side_effect = RuntimeError(
        "Library libcublas.so.12 is not found or cannot be loaded"
    )
    # Model #2 (forced CPU) transcribes successfully.
    working_cpu_model = MagicMock()
    working_cpu_model.transcribe.return_value = ([segment], meta)

    model_sequence = iter((broken_gpu_model, working_cpu_model))
    load_attempts = []

    def fake_whisper(model_name, device, compute_type):
        load_attempts.append((device, compute_type))
        return next(model_sequence)

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
            patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
            patch("tools.transcription_tools._local_model", None), \
            patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    assert result["success"] is True
    assert result["transcript"] == "recovered"
    # First load is auto, retry forces CPU.
    assert load_attempts == [("auto", "auto"), ("cpu", "int8")]
    # Cached-bad-model eviction: the broken GPU model was called once,
    # then discarded; the CPU model served the retry.
    assert broken_gpu_model.transcribe.call_count == 1
    assert working_cpu_model.transcribe.call_count == 1
def test_cuda_out_of_memory_does_not_trigger_cpu_fallback(self, tmp_path):
    """'CUDA out of memory' is a real error, not a missing lib — surface it."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    # Every construction attempt raises OOM; no marker in the message
    # matches the missing-lib heuristic, so no CPU retry should happen.
    whisper_cls = MagicMock(side_effect=RuntimeError("CUDA out of memory"))

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
            patch("faster_whisper.WhisperModel", whisper_cls), \
            patch("tools.transcription_tools._local_model", None), \
            patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    # Single call — no CPU retry, because OOM isn't a missing-lib symptom.
    assert whisper_cls.call_count == 1
    assert result["success"] is False
    assert "CUDA out of memory" in result["error"]
# ============================================================================ # ============================================================================
# Model auto-correction # Model auto-correction

View file

@@ -313,6 +313,66 @@ def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Substrings that identify a missing/unloadable CUDA runtime library. When
# ctranslate2 (the backend for faster-whisper) cannot dlopen one of these, the
# "auto" device picker has already committed to CUDA and the model can no
# longer be used — we fall back to CPU and reload.
#
# Deliberately narrow: we match on library-name tokens and dlopen phrasing so
# we DO NOT accidentally catch legitimate runtime failures like "CUDA out of
# memory" — those should surface to the user, not silently fall back to CPU
# (a 32GB audio clip on CPU at int8 isn't useful either).
_CUDA_LIB_ERROR_MARKERS = (
"libcublas",
"libcudnn",
"libcudart",
"cannot be loaded",
"cannot open shared object",
"no kernel image is available",
"no CUDA-capable device",
"CUDA driver version is insufficient",
)
def _looks_like_cuda_lib_error(exc: BaseException) -> bool:
"""Heuristic: is this exception a missing/broken CUDA runtime library?
ctranslate2 raises plain RuntimeError with messages like
``Library libcublas.so.12 is not found or cannot be loaded``. We want to
catch missing/unloadable shared libs and driver-mismatch errors, NOT
legitimate runtime failures ("CUDA out of memory", model bugs, etc.).
"""
msg = str(exc)
return any(marker in msg for marker in _CUDA_LIB_ERROR_MARKERS)
def _load_local_whisper_model(model_name: str):
    """Load faster-whisper, degrading gracefully from CUDA to CPU.

    faster-whisper's ``device="auto"`` picks CUDA whenever the ctranslate2
    wheel bundles CUDA shared libs — even on hosts without the NVIDIA
    runtime (``libcublas.so.12`` / ``libcudnn*``), which is common on WSL2
    without CUDA-on-WSL, headless servers, and CPU-only developer machines.
    On those hosts construction sometimes succeeds and the dlopen failure
    only surfaces at the first ``transcribe()`` call.

    Strategy: attempt ``auto`` first (fast CUDA path when available); if
    construction fails with a missing-CUDA-library error, retry on CPU with
    int8. Any other exception propagates unchanged.
    """
    from faster_whisper import WhisperModel

    try:
        model = WhisperModel(model_name, device="auto", compute_type="auto")
    except Exception as exc:
        if not _looks_like_cuda_lib_error(exc):
            # Not a missing-lib symptom (e.g. OOM, bad model name) — surface it.
            raise
        logger.warning(
            "faster-whisper CUDA load failed (%s) — falling back to CPU (int8). "
            "Install the NVIDIA CUDA runtime (libcublas/libcudnn) to use GPU.",
            exc,
        )
        model = WhisperModel(model_name, device="cpu", compute_type="int8")
    return model
def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]: def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
"""Transcribe using faster-whisper (local, free).""" """Transcribe using faster-whisper (local, free)."""
global _local_model, _local_model_name global _local_model, _local_model_name
@@ -321,11 +381,10 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
return {"success": False, "transcript": "", "error": "faster-whisper not installed"} return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
try: try:
from faster_whisper import WhisperModel
# Lazy-load the model (downloads on first use, ~150 MB for 'base') # Lazy-load the model (downloads on first use, ~150 MB for 'base')
if _local_model is None or _local_model_name != model_name: if _local_model is None or _local_model_name != model_name:
logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name) logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
_local_model = WhisperModel(model_name, device="auto", compute_type="auto") _local_model = _load_local_whisper_model(model_name)
_local_model_name = model_name _local_model_name = model_name
# Language: config.yaml (stt.local.language) > env var > auto-detect. # Language: config.yaml (stt.local.language) > env var > auto-detect.
@@ -338,6 +397,27 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
if _forced_lang: if _forced_lang:
transcribe_kwargs["language"] = _forced_lang transcribe_kwargs["language"] = _forced_lang
try:
segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
transcript = " ".join(segment.text.strip() for segment in segments)
except Exception as exc:
# CUDA runtime libs sometimes only fail at dlopen-on-first-use,
# AFTER the model loaded successfully. Evict the broken cached
# model, reload on CPU, retry once. Without this the module-
# global `_local_model` is poisoned and every subsequent voice
# message on this process fails identically until restart.
if not _looks_like_cuda_lib_error(exc):
raise
logger.warning(
"faster-whisper CUDA runtime failed mid-transcribe (%s) — "
"evicting cached model and retrying on CPU (int8).",
exc,
)
_local_model = None
_local_model_name = None
from faster_whisper import WhisperModel
_local_model = WhisperModel(model_name, device="cpu", compute_type="int8")
_local_model_name = model_name
segments, info = _local_model.transcribe(file_path, **transcribe_kwargs) segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
transcript = " ".join(segment.text.strip() for segment in segments) transcript = " ".join(segment.text.strip() for segment in segments)