mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(transcription): fall back to CPU when CUDA runtime libs are missing
faster-whisper's device="auto" picks CUDA when ctranslate2's wheel ships CUDA shared libs, even on hosts without the NVIDIA runtime (libcublas.so.12 / libcudnn*). On those hosts the model often loads fine but transcribe() fails at first dlopen, and the broken model stays cached in the module-global — every subsequent voice message in the gateway process fails identically until restart. - Add _load_local_whisper_model() wrapper: try auto, catch missing-lib errors, retry on device=cpu compute_type=int8. - Wrap transcribe() with the same fallback: evict cached model, reload on CPU, retry once. Required because the dlopen failure only surfaces at first kernel launch, not at model construction. - Narrow marker list (libcublas, libcudnn, libcudart, 'cannot be loaded', 'no kernel image is available', 'no CUDA-capable device', driver mismatch). Deliberately excludes 'CUDA out of memory' and similar — those are real runtime failures that should surface, not be silently retried on CPU. - Tests for load-time fallback, runtime fallback (with cached-model eviction verified), and the OOM non-fallback path. Reported via Telegram voice-message dumps on WSL2 hosts where libcublas isn't installed by default.
This commit is contained in:
parent
34c3e67109
commit
4350668ae4
2 changed files with 179 additions and 4 deletions
|
|
@ -505,6 +505,101 @@ class TestTranscribeLocalExtended:
|
|||
assert result["success"] is True
|
||||
assert result["transcript"] == "Hello world"
|
||||
|
||||
def test_load_time_cuda_lib_failure_falls_back_to_cpu(self, tmp_path):
    """Missing libcublas at load time → reload on CPU, succeed."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    # Minimal fake transcription result served by the CPU model.
    segment = MagicMock()
    segment.text = "hi"
    transcribe_info = MagicMock()
    transcribe_info.language = "en"
    transcribe_info.duration = 1.0

    fallback_model = MagicMock()
    fallback_model.transcribe.return_value = ([segment], transcribe_info)

    load_attempts = []

    def record_and_maybe_fail(model_name, device, compute_type):
        # The first (auto-device) load simulates the missing CUDA shared
        # library; the forced-CPU retry returns a working model.
        load_attempts.append((device, compute_type))
        if device == "auto":
            raise RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
        return fallback_model

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", side_effect=record_and_maybe_fail), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    assert result["success"] is True
    assert result["transcript"] == "hi"
    # One auto attempt, then exactly one cpu/int8 retry.
    assert load_attempts == [("auto", "auto"), ("cpu", "int8")]
|
||||
|
||||
def test_runtime_cuda_lib_failure_evicts_cache_and_retries_on_cpu(self, tmp_path):
    """libcublas dlopen fails at transcribe() → evict cache, reload CPU, retry."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    segment = MagicMock()
    segment.text = "recovered"
    transcribe_info = MagicMock()
    transcribe_info.language = "en"
    transcribe_info.duration = 1.0

    # First model constructs fine (auto device) but transcribe() blows up
    # on dlopen of the CUDA shared library.
    failing_model = MagicMock()
    failing_model.transcribe.side_effect = RuntimeError(
        "Library libcublas.so.12 is not found or cannot be loaded"
    )
    # Second model (forced CPU) transcribes successfully.
    recovery_model = MagicMock()
    recovery_model.transcribe.return_value = ([segment], transcribe_info)

    model_queue = iter([failing_model, recovery_model])
    load_attempts = []

    def record_and_serve(model_name, device, compute_type):
        load_attempts.append((device, compute_type))
        return next(model_queue)

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", side_effect=record_and_serve), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    assert result["success"] is True
    assert result["transcript"] == "recovered"
    # First load is auto, retry forces CPU.
    assert load_attempts == [("auto", "auto"), ("cpu", "int8")]
    # Cached-bad-model eviction: the broken GPU model was called once,
    # then discarded; the CPU model served the retry.
    assert failing_model.transcribe.call_count == 1
    assert recovery_model.transcribe.call_count == 1
|
||||
|
||||
def test_cuda_out_of_memory_does_not_trigger_cpu_fallback(self, tmp_path):
    """'CUDA out of memory' is a real error, not a missing lib — surface it."""
    audio = tmp_path / "test.ogg"
    audio.write_bytes(b"fake")

    # Model construction raises a genuine runtime failure, not a
    # missing-library symptom.
    whisper_factory = MagicMock(side_effect=RuntimeError("CUDA out of memory"))

    with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
         patch("faster_whisper.WhisperModel", whisper_factory), \
         patch("tools.transcription_tools._local_model", None), \
         patch("tools.transcription_tools._local_model_name", None):
        from tools.transcription_tools import _transcribe_local
        result = _transcribe_local(str(audio), "base")

    # Single call — no CPU retry, because OOM isn't a missing-lib symptom.
    assert whisper_factory.call_count == 1
    assert result["success"] is False
    assert "CUDA out of memory" in result["error"]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Model auto-correction
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue