fix(transcription): fall back to CPU when CUDA runtime libs are missing

faster-whisper's device="auto" picks CUDA when ctranslate2's wheel
ships CUDA shared libs, even on hosts without the NVIDIA runtime
(libcublas.so.12 / libcudnn*). On those hosts the model often loads
fine but transcribe() fails at first dlopen, and the broken model
stays cached in the module-global — every subsequent voice message
in the gateway process fails identically until restart.

- Add _load_local_whisper_model() wrapper: try auto, catch missing-lib
  errors, retry on device=cpu compute_type=int8.
- Wrap transcribe() with the same fallback: evict cached model, reload
  on CPU, retry once. Required because the dlopen failure only surfaces
  at first kernel launch, not at model construction.
- Narrow marker list (libcublas, libcudnn, libcudart, 'cannot be loaded',
  'no kernel image is available', 'no CUDA-capable device', driver
  mismatch). Deliberately excludes 'CUDA out of memory' and similar —
  those are real runtime failures that should surface, not be silently
  retried on CPU.
- Tests for load-time fallback, runtime fallback (with cached-model
  eviction verified), and the OOM non-fallback path.

Reported via Telegram voice-message dumps on WSL2 hosts where libcublas
isn't installed by default.
This commit is contained in:
Teknium 2026-04-24 02:49:18 -07:00 committed by Teknium
parent 34c3e67109
commit 4350668ae4
2 changed files with 179 additions and 4 deletions

View file

@@ -505,6 +505,101 @@ class TestTranscribeLocalExtended:
assert result["success"] is True assert result["success"] is True
assert result["transcript"] == "Hello world" assert result["transcript"] == "Hello world"
def test_load_time_cuda_lib_failure_falls_back_to_cpu(self, tmp_path):
    """Missing libcublas at load time → reload on CPU, succeed."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    # Successful CPU model: one segment, plus stream metadata.
    segment = MagicMock()
    segment.text = "hi"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0
    fallback_model = MagicMock()
    fallback_model.transcribe.return_value = ([segment], meta)

    load_attempts = []

    def fake_whisper(model_name, device, compute_type):
        # Record every (device, compute_type) the code tries, in order.
        load_attempts.append((device, compute_type))
        if device == "auto":
            # Simulate ctranslate2's dlopen failure on the GPU path.
            raise RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
        return fallback_model

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
            patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
            patch("tools.transcription_tools._local_model", None), \
            patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    assert result["success"] is True
    assert result["transcript"] == "hi"
    assert load_attempts == [("auto", "auto"), ("cpu", "int8")]
def test_runtime_cuda_lib_failure_evicts_cache_and_retries_on_cpu(self, tmp_path):
    """libcublas dlopen fails at transcribe() → evict cache, reload CPU, retry."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    segment = MagicMock()
    segment.text = "recovered"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0

    # Model #1 (device=auto) constructs fine, but the CUDA libs only fail
    # at first use — transcribe() raises the dlopen error.
    broken_gpu_model = MagicMock()
    broken_gpu_model.transcribe.side_effect = RuntimeError(
        "Library libcublas.so.12 is not found or cannot be loaded"
    )
    # Model #2 (forced CPU) transcribes successfully.
    working_cpu_model = MagicMock()
    working_cpu_model.transcribe.return_value = ([segment], meta)

    model_sequence = iter((broken_gpu_model, working_cpu_model))
    load_attempts = []

    def fake_whisper(model_name, device, compute_type):
        load_attempts.append((device, compute_type))
        return next(model_sequence)

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
            patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
            patch("tools.transcription_tools._local_model", None), \
            patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    assert result["success"] is True
    assert result["transcript"] == "recovered"
    # First load is auto, retry forces CPU.
    assert load_attempts == [("auto", "auto"), ("cpu", "int8")]
    # Cached-bad-model eviction: the broken GPU model was called once,
    # then discarded; the CPU model served the retry.
    assert broken_gpu_model.transcribe.call_count == 1
    assert working_cpu_model.transcribe.call_count == 1
def test_cuda_out_of_memory_does_not_trigger_cpu_fallback(self, tmp_path):
    """'CUDA out of memory' is a real error, not a missing lib — surface it."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    # Every construction attempt raises OOM; no marker in the message
    # matches the missing-lib heuristic, so no CPU retry should happen.
    whisper_cls = MagicMock(side_effect=RuntimeError("CUDA out of memory"))

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
            patch("faster_whisper.WhisperModel", whisper_cls), \
            patch("tools.transcription_tools._local_model", None), \
            patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    # Single call — no CPU retry, because OOM isn't a missing-lib symptom.
    assert whisper_cls.call_count == 1
    assert result["success"] is False
    assert "CUDA out of memory" in result["error"]
# ============================================================================ # ============================================================================
# Model auto-correction # Model auto-correction

View file

@@ -313,6 +313,66 @@ def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Substrings that identify a missing/unloadable CUDA runtime library. When
# ctranslate2 (the backend for faster-whisper) cannot dlopen one of these, the
# "auto" device picker has already committed to CUDA and the model can no
# longer be used — we fall back to CPU and reload.
#
# Deliberately narrow: we match on library-name tokens and dlopen phrasing so
# we DO NOT accidentally catch legitimate runtime failures like "CUDA out of
# memory" — those should surface to the user, not silently fall back to CPU
# (a 32GB audio clip on CPU at int8 isn't useful either).
_CUDA_LIB_ERROR_MARKERS = (
"libcublas",
"libcudnn",
"libcudart",
"cannot be loaded",
"cannot open shared object",
"no kernel image is available",
"no CUDA-capable device",
"CUDA driver version is insufficient",
)
def _looks_like_cuda_lib_error(exc: BaseException) -> bool:
"""Heuristic: is this exception a missing/broken CUDA runtime library?
ctranslate2 raises plain RuntimeError with messages like
``Library libcublas.so.12 is not found or cannot be loaded``. We want to
catch missing/unloadable shared libs and driver-mismatch errors, NOT
legitimate runtime failures ("CUDA out of memory", model bugs, etc.).
"""
msg = str(exc)
return any(marker in msg for marker in _CUDA_LIB_ERROR_MARKERS)
def _load_local_whisper_model(model_name: str):
    """Load faster-whisper, degrading gracefully from CUDA to CPU.

    faster-whisper's ``device="auto"`` picks CUDA whenever the ctranslate2
    wheel bundles CUDA shared libs — even on hosts without the NVIDIA
    runtime (``libcublas.so.12`` / ``libcudnn*``), which is common on WSL2
    without CUDA-on-WSL, headless servers, and CPU-only developer machines.
    On those hosts construction sometimes succeeds and the dlopen failure
    only surfaces at the first ``transcribe()`` call.

    Strategy: attempt ``auto`` first (fast CUDA path when available); if
    construction fails with a missing-CUDA-library error, retry on CPU with
    int8. Any other exception propagates unchanged.
    """
    from faster_whisper import WhisperModel

    try:
        model = WhisperModel(model_name, device="auto", compute_type="auto")
    except Exception as exc:
        if not _looks_like_cuda_lib_error(exc):
            # Not a missing-lib symptom (e.g. OOM, bad model name) — surface it.
            raise
        logger.warning(
            "faster-whisper CUDA load failed (%s) — falling back to CPU (int8). "
            "Install the NVIDIA CUDA runtime (libcublas/libcudnn) to use GPU.",
            exc,
        )
        model = WhisperModel(model_name, device="cpu", compute_type="int8")
    return model
def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]: def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
"""Transcribe using faster-whisper (local, free).""" """Transcribe using faster-whisper (local, free)."""
global _local_model, _local_model_name global _local_model, _local_model_name
@@ -321,11 +381,10 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
return {"success": False, "transcript": "", "error": "faster-whisper not installed"} return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
try: try:
from faster_whisper import WhisperModel
# Lazy-load the model (downloads on first use, ~150 MB for 'base') # Lazy-load the model (downloads on first use, ~150 MB for 'base')
if _local_model is None or _local_model_name != model_name: if _local_model is None or _local_model_name != model_name:
logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name) logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
_local_model = WhisperModel(model_name, device="auto", compute_type="auto") _local_model = _load_local_whisper_model(model_name)
_local_model_name = model_name _local_model_name = model_name
# Language: config.yaml (stt.local.language) > env var > auto-detect. # Language: config.yaml (stt.local.language) > env var > auto-detect.
@@ -338,6 +397,27 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
if _forced_lang: if _forced_lang:
transcribe_kwargs["language"] = _forced_lang transcribe_kwargs["language"] = _forced_lang
try:
segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
transcript = " ".join(segment.text.strip() for segment in segments)
except Exception as exc:
# CUDA runtime libs sometimes only fail at dlopen-on-first-use,
# AFTER the model loaded successfully. Evict the broken cached
# model, reload on CPU, retry once. Without this the module-
# global `_local_model` is poisoned and every subsequent voice
# message on this process fails identically until restart.
if not _looks_like_cuda_lib_error(exc):
raise
logger.warning(
"faster-whisper CUDA runtime failed mid-transcribe (%s) — "
"evicting cached model and retrying on CPU (int8).",
exc,
)
_local_model = None
_local_model_name = None
from faster_whisper import WhisperModel
_local_model = WhisperModel(model_name, device="cpu", compute_type="int8")
_local_model_name = model_name
segments, info = _local_model.transcribe(file_path, **transcribe_kwargs) segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
transcript = " ".join(segment.text.strip() for segment in segments) transcript = " ".join(segment.text.strip() for segment in segments)