fix(transcription): fall back to CPU when CUDA runtime libs are missing

faster-whisper's device="auto" picks CUDA when ctranslate2's wheel
ships CUDA shared libs, even on hosts without the NVIDIA runtime
(libcublas.so.12 / libcudnn*). On those hosts the model often loads
fine but transcribe() fails at first dlopen, and the broken model
stays cached in the module-global — every subsequent voice message
in the gateway process fails identically until restart.

- Add _load_local_whisper_model() wrapper: try auto, catch missing-lib
  errors, retry on device=cpu compute_type=int8.
- Wrap transcribe() with the same fallback: evict cached model, reload
  on CPU, retry once. Required because the dlopen failure only surfaces
  at first kernel launch, not at model construction.
- Narrow marker list (libcublas, libcudnn, libcudart, 'cannot be loaded',
  'no kernel image is available', 'no CUDA-capable device', driver
  mismatch). Deliberately excludes 'CUDA out of memory' and similar —
  those are real runtime failures that should surface, not be silently
  retried on CPU.
- Tests for load-time fallback, runtime fallback (with cached-model
  eviction verified), and the OOM non-fallback path.

Reported via Telegram voice-message dumps on WSL2 hosts where libcublas
isn't installed by default.
This commit is contained in:
Teknium 2026-04-24 02:49:18 -07:00 committed by Teknium
parent 34c3e67109
commit 4350668ae4
2 changed files with 179 additions and 4 deletions

View file

@ -505,6 +505,101 @@ class TestTranscribeLocalExtended:
assert result["success"] is True
assert result["transcript"] == "Hello world"
def test_load_time_cuda_lib_failure_falls_back_to_cpu(self, tmp_path):
    """Missing libcublas at load time → reload on CPU, succeed."""
    audio_path = tmp_path / "test.ogg"
    audio_path.write_bytes(b"fake")

    # Successful transcription result returned by the CPU-loaded model.
    segment = MagicMock()
    segment.text = "hi"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0
    working_model = MagicMock()
    working_model.transcribe.return_value = ([segment], meta)

    # Record every (device, compute_type) the loader tries, in order.
    attempted_loads = []

    def fake_whisper(model_name, device, compute_type):
        attempted_loads.append((device, compute_type))
        # Simulate the broken-CUDA host: "auto" resolves to GPU and the
        # shared-library dlopen fails at model construction.
        if device == "auto":
            raise RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
        return working_model

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio_path), "base")

    assert result["success"] is True
    assert result["transcript"] == "hi"
    # First attempt is the default auto load; the retry forces CPU/int8.
    assert attempted_loads == [("auto", "auto"), ("cpu", "int8")]
def test_runtime_cuda_lib_failure_evicts_cache_and_retries_on_cpu(self, tmp_path):
    """libcublas dlopen fails at transcribe() → evict cache, reload CPU, retry."""
    audio_path = tmp_path / "test.ogg"
    audio_path.write_bytes(b"fake")

    # Result produced by the healthy CPU model on the retry path.
    segment = MagicMock()
    segment.text = "recovered"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0

    # Model #1 constructs fine under device="auto", but the CUDA shared
    # library only dlopens at the first transcribe() call — and fails.
    broken_model = MagicMock()
    broken_model.transcribe.side_effect = RuntimeError(
        "Library libcublas.so.12 is not found or cannot be loaded"
    )
    # Model #2 is the forced-CPU reload and works.
    healthy_model = MagicMock()
    healthy_model.transcribe.return_value = ([segment], meta)

    model_queue = [broken_model, healthy_model]
    attempted_loads = []

    def fake_whisper(model_name, device, compute_type):
        attempted_loads.append((device, compute_type))
        return model_queue.pop(0)

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio_path), "base")

    assert result["success"] is True
    assert result["transcript"] == "recovered"
    # First load is auto, retry forces CPU.
    assert attempted_loads == [("auto", "auto"), ("cpu", "int8")]
    # Cached-bad-model eviction: the broken GPU model was called once,
    # then discarded; the CPU model served the retry.
    assert broken_model.transcribe.call_count == 1
    assert healthy_model.transcribe.call_count == 1
def test_cuda_out_of_memory_does_not_trigger_cpu_fallback(self, tmp_path):
    """'CUDA out of memory' is a real error, not a missing lib — surface it."""
    audio_path = tmp_path / "test.ogg"
    audio_path.write_bytes(b"fake")

    # Every construction attempt raises OOM — a genuine runtime failure,
    # not a missing-library symptom, so no CPU retry should happen.
    whisper_cls = MagicMock(side_effect=RuntimeError("CUDA out of memory"))

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", whisper_cls), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio_path), "base")

    # Single call — no CPU retry, because OOM isn't a missing-lib symptom.
    assert whisper_cls.call_count == 1
    assert result["success"] is False
    assert "CUDA out of memory" in result["error"]
# ============================================================================
# Model auto-correction