fix(transcription): reject symlinked audio inputs (#10082)

* fix(transcription): reject symlinked audio inputs

Validation runs before provider selection, so rejecting symbolic-link paths there prevents supported-extension links from being treated as normal audio files. Use os.path.islink to avoid perturbing the existing Path.stat error path and to reject links before resolving targets.

Constraint: Keep validation platform-safe and avoid requiring symlink support where unavailable.
Rejected: Use Path.is_symlink | it goes through pathlib's stat calls, which broke the existing stat-error regression test.
Confidence: high
Scope-risk: narrow
Directive: Keep path hardening in _validate_audio_file before provider dispatch.
Tested: source venv/bin/activate && python -m pytest tests/tools/test_transcription_tools.py::TestValidateAudioFileEdgeCases -q (5 passed)
Tested: source venv/bin/activate && python -m pytest tests/tools/test_transcription_tools.py::TestValidateAudioFileEdgeCases tests/tools/test_transcription_tools.py::TestTranscribeAudioDispatch::test_invalid_file_short_circuits -q (6 passed)
Tested: source venv/bin/activate && python -m compileall tools/transcription_tools.py tests/tools/test_transcription_tools.py
Tested: git diff --check
Not-tested: The full tests/tools/test_transcription_tools.py suite in an environment installed with only .[dev]; the existing tests that depend on the optional faster_whisper package fail there with ModuleNotFoundError.

* Keep transcription tests independent of optional whisper install

The transcription suite mocks faster-whisper directly, so a minimal test stub keeps the branch verifiable in environments where the optional package is not installed. This preserves the existing mock-based coverage without adding a dependency.

Constraint: faster-whisper is an optional local STT dependency and is absent from the current validation environment
Rejected: Install faster-whisper just for branch validation | would add heavyweight environment coupling outside the patch scope
Confidence: high
Scope-risk: narrow
Directive: Keep this as a test-only stub unless production import semantics change
Tested: pytest tests/tools/test_transcription_tools.py -q

---------

Co-authored-by: WuKongAI-CMU <210765158+WuKongAI-CMU@users.noreply.github.com>
This commit is contained in:
Peter 2026-05-25 08:07:45 -04:00 committed by GitHub
parent ee59ef1946
commit 95848b1cbc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 26 additions and 0 deletions

View file

@ -6,13 +6,20 @@ end-to-end dispatch. All external dependencies are mocked.
"""
import importlib.util
import os
import struct
import subprocess
import sys
import types
import wave
from unittest.mock import MagicMock, patch

import pytest
# faster-whisper is an optional local STT dependency, and the tests in this
# module mock it directly. Install a minimal stand-in module only when the real
# package is genuinely unavailable: checking ``sys.modules`` alone would also
# fire when the package is installed but simply not imported yet, and the stub
# would then shadow the real package for the rest of the test session.
if (
    "faster_whisper" not in sys.modules
    and importlib.util.find_spec("faster_whisper") is None
):
    faster_whisper_stub = types.ModuleType("faster_whisper")
    # Tests patch this attribute; a MagicMock keeps attribute lookups working.
    faster_whisper_stub.WhisperModel = MagicMock(name="WhisperModel")
    sys.modules["faster_whisper"] = faster_whisper_stub
# ============================================================================
# Fixtures
@ -761,6 +768,23 @@ class TestValidateAudioFileEdgeCases:
assert result is not None
assert "not a file" in result["error"]
def test_symlink_with_supported_extension_is_rejected(self, tmp_path):
    """A symlink named like an audio file must be refused by validation.

    The link carries a supported ``.wav`` extension but points at a
    non-audio file; the validator is expected to return an error dict that
    mentions a symbolic link. The test is skipped when the platform
    cannot create symlinks.
    """
    if getattr(os, "symlink", None) is None:
        pytest.skip("symlinks are not supported on this platform")

    real_file = tmp_path / "target.txt"
    real_file.write_bytes(b"not audio")
    alias = tmp_path / "linked.wav"

    # Creation can still fail at runtime (e.g. missing privileges), so treat
    # that as "unsupported here" rather than as a test failure.
    try:
        os.symlink(real_file, alias)
    except (OSError, NotImplementedError) as exc:
        pytest.skip(f"symlink creation unavailable: {exc}")

    from tools.transcription_tools import _validate_audio_file

    outcome = _validate_audio_file(str(alias))

    assert outcome is not None
    assert "symbolic link" in outcome["error"]
def test_stat_oserror(self, tmp_path):
f = tmp_path / "test.ogg"
f.write_bytes(b"data")

View file

@ -998,6 +998,8 @@ def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
"""Validate the audio file. Returns an error dict or None if OK."""
audio_path = Path(file_path)
if os.path.islink(audio_path):
return {"success": False, "transcript": "", "error": f"Path is a symbolic link: {file_path}"}
if not audio_path.exists():
return {"success": False, "transcript": "", "error": f"Audio file not found: {file_path}"}
if not audio_path.is_file():