diff --git a/tests/tools/test_transcription_tools.py b/tests/tools/test_transcription_tools.py
index 9e753af53..50cbe22a6 100644
--- a/tests/tools/test_transcription_tools.py
+++ b/tests/tools/test_transcription_tools.py
@@ -505,6 +505,101 @@ class TestTranscribeLocalExtended:
         assert result["success"] is True
         assert result["transcript"] == "Hello world"
 
+    def test_load_time_cuda_lib_failure_falls_back_to_cpu(self, tmp_path):
+        """Missing libcublas at load time → reload on CPU, succeed."""
+        audio = tmp_path / "test.ogg"
+        audio.write_bytes(b"fake")
+
+        seg = MagicMock()
+        seg.text = "hi"
+        info = MagicMock()
+        info.language = "en"
+        info.duration = 1.0
+
+        cpu_model = MagicMock()
+        cpu_model.transcribe.return_value = ([seg], info)
+
+        call_args = []
+
+        def fake_whisper(model_name, device, compute_type):
+            call_args.append((device, compute_type))
+            if device == "auto":
+                raise RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
+            return cpu_model
+
+        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
+             patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
+             patch("tools.transcription_tools._local_model", None), \
+             patch("tools.transcription_tools._local_model_name", None):
+            from tools.transcription_tools import _transcribe_local
+            result = _transcribe_local(str(audio), "base")
+
+        assert result["success"] is True
+        assert result["transcript"] == "hi"
+        assert call_args == [("auto", "auto"), ("cpu", "int8")]
+
+    def test_runtime_cuda_lib_failure_evicts_cache_and_retries_on_cpu(self, tmp_path):
+        """libcublas dlopen fails at transcribe() → evict cache, reload CPU, retry."""
+        audio = tmp_path / "test.ogg"
+        audio.write_bytes(b"fake")
+
+        seg = MagicMock()
+        seg.text = "recovered"
+        info = MagicMock()
+        info.language = "en"
+        info.duration = 1.0
+
+        # First model loads fine (auto), but transcribe() blows up on dlopen
+        gpu_model = MagicMock()
+        gpu_model.transcribe.side_effect = RuntimeError(
+            "Library libcublas.so.12 is not found or cannot be loaded"
+        )
+        # Second model (forced CPU) works
+        cpu_model = MagicMock()
+        cpu_model.transcribe.return_value = ([seg], info)
+
+        models = [gpu_model, cpu_model]
+        call_args = []
+
+        def fake_whisper(model_name, device, compute_type):
+            call_args.append((device, compute_type))
+            return models.pop(0)
+
+        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
+             patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
+             patch("tools.transcription_tools._local_model", None), \
+             patch("tools.transcription_tools._local_model_name", None):
+            from tools.transcription_tools import _transcribe_local
+            result = _transcribe_local(str(audio), "base")
+
+        assert result["success"] is True
+        assert result["transcript"] == "recovered"
+        # First load is auto, retry forces CPU.
+        assert call_args == [("auto", "auto"), ("cpu", "int8")]
+        # Cached-bad-model eviction: the broken GPU model was called once,
+        # then discarded; the CPU model served the retry.
+        assert gpu_model.transcribe.call_count == 1
+        assert cpu_model.transcribe.call_count == 1
+
+    def test_cuda_out_of_memory_does_not_trigger_cpu_fallback(self, tmp_path):
+        """'CUDA out of memory' is a real error, not a missing lib — surface it."""
+        audio = tmp_path / "test.ogg"
+        audio.write_bytes(b"fake")
+
+        mock_whisper_cls = MagicMock(side_effect=RuntimeError("CUDA out of memory"))
+
+        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
+             patch("faster_whisper.WhisperModel", mock_whisper_cls), \
+             patch("tools.transcription_tools._local_model", None), \
+             patch("tools.transcription_tools._local_model_name", None):
+            from tools.transcription_tools import _transcribe_local
+            result = _transcribe_local(str(audio), "base")
+
+        # Single call — no CPU retry, because OOM isn't a missing-lib symptom.
+        assert mock_whisper_cls.call_count == 1
+        assert result["success"] is False
+        assert "CUDA out of memory" in result["error"]
+
 
 # ============================================================================
 # Model auto-correction
diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py
index f57e191e3..9e8ad6927 100644
--- a/tools/transcription_tools.py
+++ b/tools/transcription_tools.py
@@ -313,6 +313,66 @@ def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
 
 # ---------------------------------------------------------------------------
 
+# Substrings that identify a missing/unloadable CUDA runtime library. When
+# ctranslate2 (the backend for faster-whisper) cannot dlopen one of these, the
+# "auto" device picker has already committed to CUDA and the model can no
+# longer be used — we fall back to CPU and reload.
+#
+# Deliberately narrow: we match on library-name tokens and dlopen phrasing so
+# we DO NOT accidentally catch legitimate runtime failures like "CUDA out of
+# memory" — those should surface to the user, not silently fall back to CPU
+# (a 32GB audio clip on CPU at int8 isn't useful either).
+_CUDA_LIB_ERROR_MARKERS = (
+    "libcublas",
+    "libcudnn",
+    "libcudart",
+    "cannot be loaded",
+    "cannot open shared object",
+    "no kernel image is available",
+    "no CUDA-capable device",
+    "CUDA driver version is insufficient",
+)
+
+
+def _looks_like_cuda_lib_error(exc: BaseException) -> bool:
+    """Heuristic: is this exception a missing/broken CUDA runtime library?
+
+    ctranslate2 raises plain RuntimeError with messages like
+    ``Library libcublas.so.12 is not found or cannot be loaded``. We want to
+    catch missing/unloadable shared libs and driver-mismatch errors, NOT
+    legitimate runtime failures ("CUDA out of memory", model bugs, etc.).
+    """
+    msg = str(exc)
+    return any(marker in msg for marker in _CUDA_LIB_ERROR_MARKERS)
+
+
+def _load_local_whisper_model(model_name: str):
+    """Load faster-whisper with graceful CUDA → CPU fallback.
+
+    faster-whisper's ``device="auto"`` picks CUDA when the ctranslate2 wheel
+    ships CUDA shared libs, even on hosts where the NVIDIA runtime
+    (``libcublas.so.12`` / ``libcudnn*``) isn't installed — common on WSL2
+    without CUDA-on-WSL, headless servers, and CPU-only developer machines.
+    On those hosts the load itself sometimes succeeds and the dlopen failure
+    only surfaces at first ``transcribe()`` call.
+
+    We try ``auto`` first (fast CUDA path when it works), and on any CUDA
+    library load failure fall back to CPU + int8.
+    """
+    from faster_whisper import WhisperModel
+    try:
+        return WhisperModel(model_name, device="auto", compute_type="auto")
+    except Exception as exc:
+        if not _looks_like_cuda_lib_error(exc):
+            raise
+        logger.warning(
+            "faster-whisper CUDA load failed (%s) — falling back to CPU (int8). "
+            "Install the NVIDIA CUDA runtime (libcublas/libcudnn) to use GPU.",
+            exc,
+        )
+        return WhisperModel(model_name, device="cpu", compute_type="int8")
+
+
 def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
     """Transcribe using faster-whisper (local, free)."""
     global _local_model, _local_model_name
@@ -321,11 +381,10 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
         return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
 
     try:
-        from faster_whisper import WhisperModel
         # Lazy-load the model (downloads on first use, ~150 MB for 'base')
         if _local_model is None or _local_model_name != model_name:
             logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
-            _local_model = WhisperModel(model_name, device="auto", compute_type="auto")
+            _local_model = _load_local_whisper_model(model_name)
             _local_model_name = model_name
 
         # Language: config.yaml (stt.local.language) > env var > auto-detect.
@@ -338,8 +397,29 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
         if _forced_lang:
             transcribe_kwargs["language"] = _forced_lang
 
-        segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
-        transcript = " ".join(segment.text.strip() for segment in segments)
+        try:
+            segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
+            transcript = " ".join(segment.text.strip() for segment in segments)
+        except Exception as exc:
+            # CUDA runtime libs sometimes only fail at dlopen-on-first-use,
+            # AFTER the model loaded successfully. Evict the broken cached
+            # model, reload on CPU, retry once. Without this the module-
+            # global `_local_model` is poisoned and every subsequent voice
+            # message on this process fails identically until restart.
+            if not _looks_like_cuda_lib_error(exc):
+                raise
+            logger.warning(
+                "faster-whisper CUDA runtime failed mid-transcribe (%s) — "
+                "evicting cached model and retrying on CPU (int8).",
+                exc,
+            )
+            _local_model = None
+            _local_model_name = None
+            from faster_whisper import WhisperModel
+            _local_model = WhisperModel(model_name, device="cpu", compute_type="int8")
+            _local_model_name = model_name
+            segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
+            transcript = " ".join(segment.text.strip() for segment in segments)
 
         logger.info(
             "Transcribed %s via local whisper (%s, lang=%s, %.1fs audio)",
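# ---------------------------------------------------------------------------
# Reviewer sketch (not part of the patch): a quick spot-check of the marker
# heuristic, assuming the patch above is applied. Both error messages are
# taken verbatim from the tests in this diff: the libcublas dlopen message
# must trigger the CPU fallback, and "CUDA out of memory" must not.

from tools.transcription_tools import _looks_like_cuda_lib_error

# Missing/unloadable CUDA lib: matches the "libcublas" marker, so fall back.
assert _looks_like_cuda_lib_error(
    RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
)
# Legitimate runtime failure: matches no marker, so it surfaces to the user.
assert not _looks_like_cuda_lib_error(RuntimeError("CUDA out of memory"))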