fix: sanitize .env before loading to prevent token duplication (#8908)

When .env files become corrupted (e.g. concatenated KEY=VALUE pairs on
a single line due to concurrent writes or encoding issues), both
python-dotenv and load_env() would parse the entire concatenated string
as a single value. This caused bot tokens to appear duplicated up to 8×,
triggering InvalidToken errors from the Telegram API.

Root cause: _sanitize_env_lines() — which correctly splits concatenated
lines — was only called during save_env_value() writes, not during reads.

Fix:
- load_env() now calls _sanitize_env_lines() before parsing
- env_loader.load_hermes_dotenv() sanitizes the .env file on disk
  before python-dotenv reads it, so os.getenv() also returns clean values
- Added tests reproducing the exact corruption pattern from #8908

Closes #8908
This commit is contained in:
Mil Wang (from Dev Box) 2026-04-13 18:41:12 +08:00 committed by Teknium
parent e77f135ed8
commit e469f3f3db
3 changed files with 159 additions and 6 deletions

View file

@ -0,0 +1,94 @@
"""Tests for .env sanitization during load to prevent token duplication (#8908)."""
import os
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
def test_load_env_sanitizes_concatenated_lines():
"""Verify load_env() splits concatenated KEY=VALUE pairs.
Reproduces the scenario from #8908 where a corrupted .env file
contained multiple tokens on a single line, causing the bot token
to be duplicated 8 times.
"""
from hermes_cli.config import load_env
token = "8356550917:AAGGEkzg06Hrc3Hjb3Sa1jkGVDOdU_lYy2Q"
# Simulate concatenated line: TOKEN=xxx followed immediately by another key
corrupted = f"TELEGRAM_BOT_TOKEN={token}ANTHROPIC_API_KEY=sk-ant-test123\n"
with tempfile.NamedTemporaryFile(
mode="w", suffix=".env", delete=False, encoding="utf-8"
) as f:
f.write(corrupted)
env_path = Path(f.name)
try:
with patch("hermes_cli.config.get_env_path", return_value=env_path):
result = load_env()
assert result.get("TELEGRAM_BOT_TOKEN") == token, (
f"Token should be exactly '{token}', got '{result.get('TELEGRAM_BOT_TOKEN')}'"
)
assert result.get("ANTHROPIC_API_KEY") == "sk-ant-test123"
finally:
env_path.unlink(missing_ok=True)
def test_load_env_normal_file_unchanged():
"""A well-formed .env file should be parsed identically."""
from hermes_cli.config import load_env
content = (
"TELEGRAM_BOT_TOKEN=mytoken123\n"
"ANTHROPIC_API_KEY=sk-ant-key\n"
"# comment\n"
"\n"
"OPENAI_API_KEY=sk-openai\n"
)
with tempfile.NamedTemporaryFile(
mode="w", suffix=".env", delete=False, encoding="utf-8"
) as f:
f.write(content)
env_path = Path(f.name)
try:
with patch("hermes_cli.config.get_env_path", return_value=env_path):
result = load_env()
assert result["TELEGRAM_BOT_TOKEN"] == "mytoken123"
assert result["ANTHROPIC_API_KEY"] == "sk-ant-key"
assert result["OPENAI_API_KEY"] == "sk-openai"
finally:
env_path.unlink(missing_ok=True)
def test_env_loader_sanitizes_before_dotenv():
"""Verify env_loader._sanitize_env_file_if_needed fixes corrupted files."""
from hermes_cli.env_loader import _sanitize_env_file_if_needed
token = "8356550917:AAGGEkzg06Hrc3Hjb3Sa1jkGVDOdU_lYy2Q"
corrupted = f"TELEGRAM_BOT_TOKEN={token}ANTHROPIC_API_KEY=sk-ant-test\n"
with tempfile.NamedTemporaryFile(
mode="w", suffix=".env", delete=False, encoding="utf-8"
) as f:
f.write(corrupted)
env_path = Path(f.name)
try:
_sanitize_env_file_if_needed(env_path)
with open(env_path, encoding="utf-8") as f:
lines = f.readlines()
# Should be split into two separate lines
assert len(lines) == 2, f"Expected 2 lines, got {len(lines)}: {lines}"
assert lines[0].startswith("TELEGRAM_BOT_TOKEN=")
assert lines[1].startswith("ANTHROPIC_API_KEY=")
# Token should not contain the second key
parsed_token = lines[0].strip().split("=", 1)[1]
assert parsed_token == token
finally:
env_path.unlink(missing_ok=True)