diff --git a/tests/tools/test_transcription_tools.py b/tests/tools/test_transcription_tools.py
index 9e753af53..50cbe22a6 100644
--- a/tests/tools/test_transcription_tools.py
+++ b/tests/tools/test_transcription_tools.py
@@ -505,6 +505,101 @@ class TestTranscribeLocalExtended:
         assert result["success"] is True
         assert result["transcript"] == "Hello world"
 
+    def test_load_time_cuda_lib_failure_falls_back_to_cpu(self, tmp_path):
+        """Missing libcublas at load time → reload on CPU, succeed."""
+        audio = tmp_path / "test.ogg"
+        audio.write_bytes(b"fake")
+
+        seg = MagicMock()
+        seg.text = "hi"
+        info = MagicMock()
+        info.language = "en"
+        info.duration = 1.0
+
+        cpu_model = MagicMock()
+        cpu_model.transcribe.return_value = ([seg], info)
+
+        call_args = []
+
+        def fake_whisper(model_name, device, compute_type):
+            call_args.append((device, compute_type))
+            if device == "auto":
+                raise RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
+            return cpu_model
+
+        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
+             patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
+             patch("tools.transcription_tools._local_model", None), \
+             patch("tools.transcription_tools._local_model_name", None):
+            from tools.transcription_tools import _transcribe_local
+            result = _transcribe_local(str(audio), "base")
+
+        assert result["success"] is True
+        assert result["transcript"] == "hi"
+        assert call_args == [("auto", "auto"), ("cpu", "int8")]
+
+    def test_runtime_cuda_lib_failure_evicts_cache_and_retries_on_cpu(self, tmp_path):
+        """libcublas dlopen fails at transcribe() → evict cache, reload CPU, retry."""
+        audio = tmp_path / "test.ogg"
+        audio.write_bytes(b"fake")
+
+        seg = MagicMock()
+        seg.text = "recovered"
+        info = MagicMock()
+        info.language = "en"
+        info.duration = 1.0
+
+        # First model loads fine (auto), but transcribe() blows up on dlopen
+        gpu_model = MagicMock()
+        gpu_model.transcribe.side_effect = RuntimeError(
+            "Library libcublas.so.12 is not found or cannot be loaded"
+        )
+        # Second model (forced CPU) works
+        cpu_model = MagicMock()
+        cpu_model.transcribe.return_value = ([seg], info)
+
+        models = [gpu_model, cpu_model]
+        call_args = []
+
+        def fake_whisper(model_name, device, compute_type):
+            call_args.append((device, compute_type))
+            return models.pop(0)
+
+        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
+             patch("faster_whisper.WhisperModel", side_effect=fake_whisper), \
+             patch("tools.transcription_tools._local_model", None), \
+             patch("tools.transcription_tools._local_model_name", None):
+            from tools.transcription_tools import _transcribe_local
+            result = _transcribe_local(str(audio), "base")
+
+        assert result["success"] is True
+        assert result["transcript"] == "recovered"
+        # First load is auto, retry forces CPU.
+        assert call_args == [("auto", "auto"), ("cpu", "int8")]
+        # Cached-bad-model eviction: the broken GPU model was called once,
+        # then discarded; the CPU model served the retry.
+        assert gpu_model.transcribe.call_count == 1
+        assert cpu_model.transcribe.call_count == 1
+
+    def test_cuda_out_of_memory_does_not_trigger_cpu_fallback(self, tmp_path):
+        """'CUDA out of memory' is a real error, not a missing lib — surface it."""
+        audio = tmp_path / "test.ogg"
+        audio.write_bytes(b"fake")
+
+        mock_whisper_cls = MagicMock(side_effect=RuntimeError("CUDA out of memory"))
+
+        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
+             patch("faster_whisper.WhisperModel", mock_whisper_cls), \
+             patch("tools.transcription_tools._local_model", None), \
+             patch("tools.transcription_tools._local_model_name", None):
+            from tools.transcription_tools import _transcribe_local
+            result = _transcribe_local(str(audio), "base")
+
+        # Single call — no CPU retry, because OOM isn't a missing-lib symptom.
+        assert mock_whisper_cls.call_count == 1
+        assert result["success"] is False
+        assert "CUDA out of memory" in result["error"]
+
 
 # ============================================================================
 # Model auto-correction
diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py
index f57e191e3..9e8ad6927 100644
--- a/tools/transcription_tools.py
+++ b/tools/transcription_tools.py
@@ -313,6 +313,66 @@ def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
 
 # ---------------------------------------------------------------------------
 
+# Substrings that identify a missing/unloadable CUDA runtime library. When
+# ctranslate2 (the backend for faster-whisper) cannot dlopen one of these, the
+# "auto" device picker has already committed to CUDA and the model can no
+# longer be used — we fall back to CPU and reload.
+#
+# Deliberately narrow: we match on library-name tokens and dlopen phrasing so
+# we DO NOT accidentally catch legitimate runtime failures like "CUDA out of
+# memory" — those should surface to the user, not silently fall back to CPU
+# (a 32GB audio clip on CPU at int8 isn't useful either).
+_CUDA_LIB_ERROR_MARKERS = (
+    "libcublas",
+    "libcudnn",
+    "libcudart",
+    "cannot be loaded",
+    "cannot open shared object",
+    "no kernel image is available",
+    "no CUDA-capable device",
+    "CUDA driver version is insufficient",
+)
+
+
+def _looks_like_cuda_lib_error(exc: BaseException) -> bool:
+    """Heuristic: is this exception a missing/broken CUDA runtime library?
+
+    ctranslate2 raises plain RuntimeError with messages like
+    ``Library libcublas.so.12 is not found or cannot be loaded``. We want to
+    catch missing/unloadable shared libs and driver-mismatch errors, NOT
+    legitimate runtime failures ("CUDA out of memory", model bugs, etc.).
+    """
+    msg = str(exc)
+    return any(marker in msg for marker in _CUDA_LIB_ERROR_MARKERS)
+
+
+def _load_local_whisper_model(model_name: str):
+    """Load faster-whisper with graceful CUDA → CPU fallback.
+
+    faster-whisper's ``device="auto"`` picks CUDA when the ctranslate2 wheel
+    ships CUDA shared libs, even on hosts where the NVIDIA runtime
+    (``libcublas.so.12`` / ``libcudnn*``) isn't installed — common on WSL2
+    without CUDA-on-WSL, headless servers, and CPU-only developer machines.
+    On those hosts the load itself sometimes succeeds and the dlopen failure
+    only surfaces at first ``transcribe()`` call.
+
+    We try ``auto`` first (fast CUDA path when it works), and on any CUDA
+    library load failure fall back to CPU + int8.
+    """
+    from faster_whisper import WhisperModel
+    try:
+        return WhisperModel(model_name, device="auto", compute_type="auto")
+    except Exception as exc:
+        if not _looks_like_cuda_lib_error(exc):
+            raise
+        logger.warning(
+            "faster-whisper CUDA load failed (%s) — falling back to CPU (int8). "
+            "Install the NVIDIA CUDA runtime (libcublas/libcudnn) to use GPU.",
+            exc,
+        )
+        return WhisperModel(model_name, device="cpu", compute_type="int8")
+
+
 def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
     """Transcribe using faster-whisper (local, free)."""
     global _local_model, _local_model_name
@@ -321,11 +381,10 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
         return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
 
     try:
-        from faster_whisper import WhisperModel
         # Lazy-load the model (downloads on first use, ~150 MB for 'base')
         if _local_model is None or _local_model_name != model_name:
             logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
-            _local_model = WhisperModel(model_name, device="auto", compute_type="auto")
+            _local_model = _load_local_whisper_model(model_name)
             _local_model_name = model_name
 
         # Language: config.yaml (stt.local.language) > env var > auto-detect.
@@ -338,8 +397,29 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
         if _forced_lang:
             transcribe_kwargs["language"] = _forced_lang
 
-        segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
-        transcript = " ".join(segment.text.strip() for segment in segments)
+        try:
+            segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
+            transcript = " ".join(segment.text.strip() for segment in segments)
+        except Exception as exc:
+            # CUDA runtime libs sometimes only fail at dlopen-on-first-use,
+            # AFTER the model loaded successfully. Evict the broken cached
+            # model, reload on CPU, retry once. Without this the module-
+            # global `_local_model` is poisoned and every subsequent voice
+            # message on this process fails identically until restart.
+            if not _looks_like_cuda_lib_error(exc):
+                raise
+            logger.warning(
+                "faster-whisper CUDA runtime failed mid-transcribe (%s) — "
+                "evicting cached model and retrying on CPU (int8).",
+                exc,
+            )
+            _local_model = None
+            _local_model_name = None
+            from faster_whisper import WhisperModel
+            _local_model = WhisperModel(model_name, device="cpu", compute_type="int8")
+            _local_model_name = model_name
+            segments, info = _local_model.transcribe(file_path, **transcribe_kwargs)
+            transcript = " ".join(segment.text.strip() for segment in segments)
 
         logger.info(
             "Transcribed %s via local whisper (%s, lang=%s, %.1fs audio)",
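# ---------------------------------------------------------------------------
# Reviewer sketch (not part of the patch): a quick spot-check of the marker
# heuristic, assuming the patch above is applied. Both error messages are
# taken verbatim from the tests in this diff: the libcublas dlopen message
# must trigger the CPU fallback, and "CUDA out of memory" must not.

from tools.transcription_tools import _looks_like_cuda_lib_error

# Missing/unloadable CUDA lib: matches the "libcublas" marker, so fall back.
assert _looks_like_cuda_lib_error(
    RuntimeError("Library libcublas.so.12 is not found or cannot be loaded")
)
# Legitimate runtime failure: matches no marker, so it surfaces to the user.
assert not _looks_like_cuda_lib_error(RuntimeError("CUDA out of memory"))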