fix(transcription): fall back to CPU when CUDA runtime libs are missing

faster-whisper's device="auto" picks CUDA when ctranslate2's wheel
ships CUDA shared libs, even on hosts without the NVIDIA runtime
(libcublas.so.12 / libcudnn*). On those hosts the model often loads
fine but transcribe() fails at first dlopen, and the broken model
stays cached in the module-global — every subsequent voice message
in the gateway process fails identically until restart.

- Add _load_local_whisper_model() wrapper: try auto, catch missing-lib
  errors, retry on device=cpu compute_type=int8.
- Wrap transcribe() with the same fallback: evict cached model, reload
  on CPU, retry once. Required because the dlopen failure only surfaces
  at first kernel launch, not at model construction.
- Narrow marker list (libcublas, libcudnn, libcudart, 'cannot be loaded',
  'no kernel image is available', 'no CUDA-capable device', driver
  mismatch). Deliberately excludes 'CUDA out of memory' and similar —
  those are real runtime failures that should surface, not be silently
  retried on CPU.
- Tests for load-time fallback, runtime fallback (with cached-model
  eviction verified), and the OOM non-fallback path.

Reported via Telegram voice-message dumps on WSL2 hosts where libcublas
isn't installed by default.
This commit is contained in:
Teknium 2026-04-24 02:49:18 -07:00 committed by Teknium
parent 34c3e67109
commit 4350668ae4
2 changed files with 179 additions and 4 deletions

View file

@ -505,6 +505,101 @@ class TestTranscribeLocalExtended:
assert result["success"] is True
assert result["transcript"] == "Hello world"
def test_load_time_cuda_lib_failure_falls_back_to_cpu(self, tmp_path):
    """Missing libcublas at load time → reload on CPU, succeed."""
    audio_path = tmp_path / "test.ogg"
    audio_path.write_bytes(b"fake")

    # Successful transcription result returned by the CPU-loaded model.
    segment = MagicMock()
    segment.text = "hi"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0
    working_model = MagicMock()
    working_model.transcribe.return_value = ([segment], meta)

    # Record every (device, compute_type) the loader tries, in order.
    attempted_loads = []

    def fake_whisper(model_name, device, compute_type):
        attempted_loads.append((device, compute_type))
        # Simulate the broken-CUDA host: "auto" resolves to GPU and the
        # shared-library dlopen fails at model construction.
        if device == "auto":
            raise RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
        return working_model

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio_path), "base")

    assert result["success"] is True
    assert result["transcript"] == "hi"
    # First attempt is the default auto load; the retry forces CPU/int8.
    assert attempted_loads == [("auto", "auto"), ("cpu", "int8")]
def test_runtime_cuda_lib_failure_evicts_cache_and_retries_on_cpu(self, tmp_path):
    """libcublas dlopen fails at transcribe() → evict cache, reload CPU, retry."""
    audio_path = tmp_path / "test.ogg"
    audio_path.write_bytes(b"fake")

    # Result produced by the healthy CPU model on the retry path.
    segment = MagicMock()
    segment.text = "recovered"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0

    # Model #1 constructs fine under device="auto", but the CUDA shared
    # library only dlopens at the first transcribe() call — and fails.
    broken_model = MagicMock()
    broken_model.transcribe.side_effect = RuntimeError(
        "Library libcublas.so.12 is not found or cannot be loaded"
    )
    # Model #2 is the forced-CPU reload and works.
    healthy_model = MagicMock()
    healthy_model.transcribe.return_value = ([segment], meta)

    model_queue = [broken_model, healthy_model]
    attempted_loads = []

    def fake_whisper(model_name, device, compute_type):
        attempted_loads.append((device, compute_type))
        return model_queue.pop(0)

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio_path), "base")

    assert result["success"] is True
    assert result["transcript"] == "recovered"
    # First load is auto, retry forces CPU.
    assert attempted_loads == [("auto", "auto"), ("cpu", "int8")]
    # Cached-bad-model eviction: the broken GPU model was called once,
    # then discarded; the CPU model served the retry.
    assert broken_model.transcribe.call_count == 1
    assert healthy_model.transcribe.call_count == 1
def test_cuda_out_of_memory_does_not_trigger_cpu_fallback(self, tmp_path):
    """'CUDA out of memory' is a real error, not a missing lib — surface it."""
    audio_path = tmp_path / "test.ogg"
    audio_path.write_bytes(b"fake")

    # Every construction attempt raises OOM — a genuine runtime failure,
    # not a missing-library symptom, so no CPU retry should happen.
    whisper_cls = MagicMock(side_effect=RuntimeError("CUDA out of memory"))

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", whisper_cls), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio_path), "base")

    # Single call — no CPU retry, because OOM isn't a missing-lib symptom.
    assert whisper_cls.call_count == 1
    assert result["success"] is False
    assert "CUDA out of memory" in result["error"]
# ============================================================================
# Model auto-correction