mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(transcription): fall back to CPU when CUDA runtime libs are missing
faster-whisper's device="auto" picks CUDA when ctranslate2's wheel ships CUDA shared libs, even on hosts without the NVIDIA runtime (libcublas.so.12 / libcudnn*). On those hosts the model often loads fine but transcribe() fails at first dlopen, and the broken model stays cached in the module-global — every subsequent voice message in the gateway process fails identically until restart. - Add _load_local_whisper_model() wrapper: try auto, catch missing-lib errors, retry on device=cpu compute_type=int8. - Wrap transcribe() with the same fallback: evict cached model, reload on CPU, retry once. Required because the dlopen failure only surfaces at first kernel launch, not at model construction. - Narrow marker list (libcublas, libcudnn, libcudart, 'cannot be loaded', 'no kernel image is available', 'no CUDA-capable device', driver mismatch). Deliberately excludes 'CUDA out of memory' and similar — those are real runtime failures that should surface, not be silently retried on CPU. - Tests for load-time fallback, runtime fallback (with cached-model eviction verified), and the OOM non-fallback path. Reported via Telegram voice-message dumps on WSL2 hosts where libcublas isn't installed by default.
This commit is contained in:
parent
34c3e67109
commit
4350668ae4
2 changed files with 179 additions and 4 deletions
|
|
@ -505,6 +505,101 @@ class TestTranscribeLocalExtended:
|
||||||
assert result["success"] is True
|
assert result["success"] is True
|
||||||
assert result["transcript"] == "Hello world"
|
assert result["transcript"] == "Hello world"
|
||||||
|
|
||||||
|
def test_load_time_cuda_lib_failure_falls_back_to_cpu(self, tmp_path):
    """Missing libcublas at load time → reload on CPU, succeed."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    # Fake segment/info pair the working CPU model will return.
    segment = MagicMock()
    segment.text = "hi"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0

    working_model = MagicMock()
    working_model.transcribe.return_value = ([segment], meta)

    observed_loads = []

    def whisper_factory(model_name, device, compute_type):
        # Record every (device, compute_type) the loader asks for; the
        # "auto" attempt simulates a host missing the CUDA runtime libs.
        observed_loads.append((device, compute_type))
        if device == "auto":
            raise RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
        return working_model

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", side_effect=whisper_factory), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    assert result["success"] is True
    assert result["transcript"] == "hi"
    # First attempt is device="auto"; the fallback must be cpu/int8.
    assert observed_loads == [("auto", "auto"), ("cpu", "int8")]
def test_runtime_cuda_lib_failure_evicts_cache_and_retries_on_cpu(self, tmp_path):
    """libcublas dlopen fails at transcribe() → evict cache, reload CPU, retry."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    segment = MagicMock()
    segment.text = "recovered"
    meta = MagicMock()
    meta.language = "en"
    meta.duration = 1.0

    # Model #1 constructs fine under device="auto" but dies at first use,
    # mimicking the dlopen-on-first-kernel-launch failure mode.
    broken_gpu_model = MagicMock()
    broken_gpu_model.transcribe.side_effect = RuntimeError(
        "Library libcublas.so.12 is not found or cannot be loaded"
    )
    # Model #2 (the forced-CPU reload) succeeds.
    healthy_cpu_model = MagicMock()
    healthy_cpu_model.transcribe.return_value = ([segment], meta)

    pending_models = [broken_gpu_model, healthy_cpu_model]
    observed_loads = []

    def whisper_factory(model_name, device, compute_type):
        observed_loads.append((device, compute_type))
        return pending_models.pop(0)

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", side_effect=whisper_factory), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    assert result["success"] is True
    assert result["transcript"] == "recovered"
    # First load is auto, retry forces CPU.
    assert observed_loads == [("auto", "auto"), ("cpu", "int8")]
    # Cached-bad-model eviction: the broken GPU model was called once,
    # then discarded; the CPU model served the retry.
    assert broken_gpu_model.transcribe.call_count == 1
    assert healthy_cpu_model.transcribe.call_count == 1
def test_cuda_out_of_memory_does_not_trigger_cpu_fallback(self, tmp_path):
    """'CUDA out of memory' is a real error, not a missing lib — surface it."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    # Construction itself raises OOM — a genuine runtime failure that the
    # missing-lib heuristic must NOT convert into a CPU retry.
    whisper_cls = MagicMock(side_effect=RuntimeError("CUDA out of memory"))

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", whisper_cls), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    # Single call — no CPU retry, because OOM isn't a missing-lib symptom.
    assert whisper_cls.call_count == 1
    assert result["success"] is False
    assert "CUDA out of memory" in result["error"]
# ============================================================================
|
# ============================================================================
|
||||||
# Model auto-correction
|
# Model auto-correction
|
||||||
|
|
|
||||||
|
|
@ -313,6 +313,66 @@ def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# Substrings that identify a missing/unloadable CUDA runtime library. When
|
||||||
|
# ctranslate2 (the backend for faster-whisper) cannot dlopen one of these, the
|
||||||
|
# "auto" device picker has already committed to CUDA and the model can no
|
||||||
|
# longer be used — we fall back to CPU and reload.
|
||||||
|
#
|
||||||
|
# Deliberately narrow: we match on library-name tokens and dlopen phrasing so
|
||||||
|
# we DO NOT accidentally catch legitimate runtime failures like "CUDA out of
|
||||||
|
# memory" — those should surface to the user, not silently fall back to CPU
|
||||||
|
# (a 32GB audio clip on CPU at int8 isn't useful either).
|
||||||
|
_CUDA_LIB_ERROR_MARKERS = (
|
||||||
|
"libcublas",
|
||||||
|
"libcudnn",
|
||||||
|
"libcudart",
|
||||||
|
"cannot be loaded",
|
||||||
|
"cannot open shared object",
|
||||||
|
"no kernel image is available",
|
||||||
|
"no CUDA-capable device",
|
||||||
|
"CUDA driver version is insufficient",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_cuda_lib_error(exc: BaseException) -> bool:
|
||||||
|
"""Heuristic: is this exception a missing/broken CUDA runtime library?
|
||||||
|
|
||||||
|
ctranslate2 raises plain RuntimeError with messages like
|
||||||
|
``Library libcublas.so.12 is not found or cannot be loaded``. We want to
|
||||||
|
catch missing/unloadable shared libs and driver-mismatch errors, NOT
|
||||||
|
legitimate runtime failures ("CUDA out of memory", model bugs, etc.).
|
||||||
|
"""
|
||||||
|
msg = str(exc)
|
||||||
|
return any(marker in msg for marker in _CUDA_LIB_ERROR_MARKERS)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_local_whisper_model(model_name: str):
    """Load faster-whisper with graceful CUDA → CPU fallback.

    faster-whisper's ``device="auto"`` picks CUDA when the ctranslate2 wheel
    ships CUDA shared libs, even on hosts where the NVIDIA runtime
    (``libcublas.so.12`` / ``libcudnn*``) isn't installed — common on WSL2
    without CUDA-on-WSL, headless servers, and CPU-only developer machines.
    On those hosts the load itself sometimes succeeds and the dlopen failure
    only surfaces at first ``transcribe()`` call.

    We try ``auto`` first (fast CUDA path when it works), and on any CUDA
    library load failure fall back to CPU + int8.
    """
    from faster_whisper import WhisperModel

    try:
        return WhisperModel(model_name, device="auto", compute_type="auto")
    except Exception as load_err:
        # Anything that isn't a missing/unloadable CUDA library is a real
        # failure and must propagate unchanged.
        if not _looks_like_cuda_lib_error(load_err):
            raise
        logger.warning(
            "faster-whisper CUDA load failed (%s) — falling back to CPU (int8). "
            "Install the NVIDIA CUDA runtime (libcublas/libcudnn) to use GPU.",
            load_err,
        )
        return WhisperModel(model_name, device="cpu", compute_type="int8")
def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||||
"""Transcribe using faster-whisper (local, free)."""
|
"""Transcribe using faster-whisper (local, free)."""
|
||||||
global _local_model, _local_model_name
|
global _local_model, _local_model_name
|
||||||
|
|
@ -321,11 +381,10 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||||
return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
|
return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from faster_whisper import WhisperModel
|
|
||||||
# Lazy-load the model (downloads on first use, ~150 MB for 'base')
|
# Lazy-load the model (downloads on first use, ~150 MB for 'base')
|
||||||
if _local_model is None or _local_model_name != model_name:
|
if _local_model is None or _local_model_name != model_name:
|
||||||
logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
|
logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
|
||||||
_local_model = WhisperModel(model_name, device="auto", compute_type="auto")
|
_local_model = _load_local_whisper_model(model_name)
|
||||||
_local_model_name = model_name
|
_local_model_name = model_name
|
||||||
|
|
||||||
# Language: config.yaml (stt.local.language) > env var > auto-detect.
|
# Language: config.yaml (stt.local.language) > env var > auto-detect.
|
||||||
|
|
@ -338,6 +397,27 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||||
if _forced_lang:
|
if _forced_lang:
|
||||||
transcribe_kwargs["language"] = _forced_lang
|
transcribe_kwargs["language"] = _forced_lang
|
||||||
|
|
||||||
|
try:
|
||||||
|
segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
|
||||||
|
transcript = " ".join(segment.text.strip() for segment in segments)
|
||||||
|
except Exception as exc:
|
||||||
|
# CUDA runtime libs sometimes only fail at dlopen-on-first-use,
|
||||||
|
# AFTER the model loaded successfully. Evict the broken cached
|
||||||
|
# model, reload on CPU, retry once. Without this the module-
|
||||||
|
# global `_local_model` is poisoned and every subsequent voice
|
||||||
|
# message on this process fails identically until restart.
|
||||||
|
if not _looks_like_cuda_lib_error(exc):
|
||||||
|
raise
|
||||||
|
logger.warning(
|
||||||
|
"faster-whisper CUDA runtime failed mid-transcribe (%s) — "
|
||||||
|
"evicting cached model and retrying on CPU (int8).",
|
||||||
|
exc,
|
||||||
|
)
|
||||||
|
_local_model = None
|
||||||
|
_local_model_name = None
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
_local_model = WhisperModel(model_name, device="cpu", compute_type="int8")
|
||||||
|
_local_model_name = model_name
|
||||||
segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
|
segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
|
||||||
transcript = " ".join(segment.text.strip() for segment in segments)
|
transcript = " ".join(segment.text.strip() for segment in segments)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue