fix: detect and strip non-ASCII characters from API keys (#6843)

API keys containing Unicode lookalike characters (e.g. ʋ U+028B instead
of v) cause UnicodeEncodeError when httpx encodes the Authorization
header as ASCII.  This commonly happens when users copy-paste keys from
PDFs, rich-text editors, or web pages with decorative fonts.

Three layers of defense:

1. **Save-time validation** (hermes_cli/config.py):
   _check_non_ascii_credential() strips non-ASCII from credential values
   when saving to .env, with a clear warning explaining the issue.

2. **Load-time sanitization** (hermes_cli/env_loader.py):
   _sanitize_loaded_credentials() strips non-ASCII from credential env
   vars (those ending in _API_KEY, _TOKEN, _SECRET, _KEY) after dotenv
   loads them, so the rest of the codebase never sees non-ASCII keys.

3. **Runtime recovery** (run_agent.py):
   The UnicodeEncodeError recovery block now also sanitizes self.api_key
   and self._client_kwargs['api_key'], fixing the gap where message/tool
   sanitization succeeded but the API key still caused httpx to fail on
   the Authorization header.

Also: hermes_logging.py RotatingFileHandler now explicitly sets
encoding='utf-8' instead of relying on locale default (defensive
hardening for ASCII-locale systems).
This commit is contained in:
Teknium 2026-04-14 17:17:15 -07:00 committed by Teknium
parent 677f1227c3
commit da528a8207
6 changed files with 206 additions and 0 deletions

View file

@ -2766,6 +2766,47 @@ def sanitize_env_file() -> int:
return fixes return fixes
def _check_non_ascii_credential(key: str, value: str) -> str:
"""Warn and strip non-ASCII characters from credential values.
API keys and tokens must be pure ASCII they are sent as HTTP header
values which httpx/httpcore encode as ASCII. Non-ASCII characters
(commonly introduced by copy-pasting from rich-text editors or PDFs
that substitute lookalike Unicode glyphs for ASCII letters) cause
``UnicodeEncodeError: 'ascii' codec can't encode character`` at
request time.
Returns the sanitized (ASCII-only) value. Prints a warning if any
non-ASCII characters were found and removed.
"""
try:
value.encode("ascii")
return value # all ASCII — nothing to do
except UnicodeEncodeError:
pass
# Build a readable list of the offending characters
bad_chars: list[str] = []
for i, ch in enumerate(value):
if ord(ch) > 127:
bad_chars.append(f" position {i}: {ch!r} (U+{ord(ch):04X})")
sanitized = value.encode("ascii", errors="ignore").decode("ascii")
import sys
print(
f"\n Warning: {key} contains non-ASCII characters that will break API requests.\n"
f" This usually happens when copy-pasting from a PDF, rich-text editor,\n"
f" or web page that substitutes lookalike Unicode glyphs for ASCII letters.\n"
f"\n"
+ "\n".join(f" {line}" for line in bad_chars[:5])
+ ("\n ... and more" if len(bad_chars) > 5 else "")
+ f"\n\n The non-ASCII characters have been stripped automatically.\n"
f" If authentication fails, re-copy the key from the provider's dashboard.\n",
file=sys.stderr,
)
return sanitized
def save_env_value(key: str, value: str): def save_env_value(key: str, value: str):
"""Save or update a value in ~/.hermes/.env.""" """Save or update a value in ~/.hermes/.env."""
if is_managed(): if is_managed():
@ -2774,6 +2815,8 @@ def save_env_value(key: str, value: str):
if not _ENV_VAR_NAME_RE.match(key): if not _ENV_VAR_NAME_RE.match(key):
raise ValueError(f"Invalid environment variable name: {key!r}") raise ValueError(f"Invalid environment variable name: {key!r}")
value = value.replace("\n", "").replace("\r", "") value = value.replace("\n", "").replace("\r", "")
# API keys / tokens must be ASCII — strip non-ASCII with a warning.
value = _check_non_ascii_credential(key, value)
ensure_hermes_home() ensure_hermes_home()
env_path = get_env_path() env_path = get_env_path()

View file

@ -8,11 +8,40 @@ from pathlib import Path
from dotenv import load_dotenv from dotenv import load_dotenv
# Env var name suffixes that indicate credential values. These are the
# only env vars whose values we sanitize on load — we must not silently
# alter arbitrary user env vars, but credentials are known to require
# pure ASCII (they become HTTP header values).
_CREDENTIAL_SUFFIXES = ("_API_KEY", "_TOKEN", "_SECRET", "_KEY")
def _sanitize_loaded_credentials() -> None:
"""Strip non-ASCII characters from credential env vars in os.environ.
Called after dotenv loads so the rest of the codebase never sees
non-ASCII API keys. Only touches env vars whose names end with
known credential suffixes (``_API_KEY``, ``_TOKEN``, etc.).
"""
for key, value in list(os.environ.items()):
if not any(key.endswith(suffix) for suffix in _CREDENTIAL_SUFFIXES):
continue
try:
value.encode("ascii")
except UnicodeEncodeError:
os.environ[key] = value.encode("ascii", errors="ignore").decode("ascii")
def _load_dotenv_with_fallback(path: Path, *, override: bool) -> None: def _load_dotenv_with_fallback(path: Path, *, override: bool) -> None:
try: try:
load_dotenv(dotenv_path=path, override=override, encoding="utf-8") load_dotenv(dotenv_path=path, override=override, encoding="utf-8")
except UnicodeDecodeError: except UnicodeDecodeError:
load_dotenv(dotenv_path=path, override=override, encoding="latin-1") load_dotenv(dotenv_path=path, override=override, encoding="latin-1")
# Strip non-ASCII characters from credential env vars that were just
# loaded. API keys must be pure ASCII since they're sent as HTTP
# header values (httpx encodes headers as ASCII). Non-ASCII chars
# typically come from copy-pasting keys from PDFs or rich-text editors
# that substitute Unicode lookalike glyphs (e.g. ʋ U+028B for v).
_sanitize_loaded_credentials()
def _sanitize_env_file_if_needed(path: Path) -> None: def _sanitize_env_file_if_needed(path: Path) -> None:

View file

@ -358,6 +358,7 @@ def _add_rotating_handler(
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
handler = _ManagedRotatingFileHandler( handler = _ManagedRotatingFileHandler(
str(path), maxBytes=max_bytes, backupCount=backup_count, str(path), maxBytes=max_bytes, backupCount=backup_count,
encoding="utf-8",
) )
handler.setLevel(level) handler.setLevel(level)
handler.setFormatter(formatter) handler.setFormatter(formatter)

View file

@ -8987,12 +8987,35 @@ class AIAgent:
if isinstance(_default_headers, dict): if isinstance(_default_headers, dict):
_headers_sanitized = _sanitize_structure_non_ascii(_default_headers) _headers_sanitized = _sanitize_structure_non_ascii(_default_headers)
# Sanitize the API key — non-ASCII characters in
# credentials (e.g. ʋ instead of v from a bad
# copy-paste) cause httpx to fail when encoding
# the Authorization header as ASCII. This is the
# most common cause of persistent UnicodeEncodeError
# that survives message/tool sanitization (#6843).
_credential_sanitized = False
_raw_key = getattr(self, "api_key", None) or ""
if _raw_key:
_clean_key = _strip_non_ascii(_raw_key)
if _clean_key != _raw_key:
self.api_key = _clean_key
if isinstance(getattr(self, "_client_kwargs", None), dict):
self._client_kwargs["api_key"] = _clean_key
_credential_sanitized = True
self._vprint(
f"{self.log_prefix}⚠️ API key contained non-ASCII characters "
f"(bad copy-paste?) — stripped them. If auth fails, "
f"re-copy the key from your provider's dashboard.",
force=True,
)
if ( if (
_messages_sanitized _messages_sanitized
or _prefill_sanitized or _prefill_sanitized
or _tools_sanitized or _tools_sanitized
or _system_sanitized or _system_sanitized
or _headers_sanitized or _headers_sanitized
or _credential_sanitized
): ):
self._unicode_sanitization_passes += 1 self._unicode_sanitization_passes += 1
self._vprint( self._vprint(

View file

@ -0,0 +1,83 @@
"""Tests for non-ASCII credential detection and sanitization.
Covers the fix for issue #6843 — API keys containing Unicode lookalike
characters (e.g. ʋ U+028B instead of v) cause UnicodeEncodeError when
httpx tries to encode the Authorization header as ASCII.
"""
import os
import sys
import tempfile
import pytest
from hermes_cli.config import _check_non_ascii_credential
class TestCheckNonAsciiCredential:
    """Tests for _check_non_ascii_credential()."""

    def test_ascii_key_unchanged(self):
        # A long, pure-ASCII key must pass through untouched.
        ascii_key = "sk-proj-" + "a" * 100
        assert _check_non_ascii_credential("TEST_API_KEY", ascii_key) == ascii_key

    def test_strips_unicode_v_lookalike(self, capsys):
        """The exact scenario from issue #6843: ʋ instead of v."""
        bad_key = "sk-proj-abc\u028bdef"  # \u028b is the ʋ lookalike
        cleaned = _check_non_ascii_credential("OPENROUTER_API_KEY", bad_key)
        assert cleaned == "sk-proj-abcdef"
        assert "\u028b" not in cleaned
        # A warning must have been emitted on stderr.
        assert "non-ASCII" in capsys.readouterr().err

    def test_strips_multiple_non_ascii(self, capsys):
        cleaned = _check_non_ascii_credential("OPENAI_API_KEY", "sk-proj-a\u028bb\u00e9cd")
        assert cleaned == "sk-proj-abcd"
        # The warning reports the offending code point.
        assert "U+028B" in capsys.readouterr().err

    def test_empty_key(self):
        assert _check_non_ascii_credential("TEST_KEY", "") == ""

    def test_all_ascii_no_warning(self, capsys):
        cleaned = _check_non_ascii_credential("KEY", "all-ascii-value-123")
        assert cleaned == "all-ascii-value-123"
        # Silent when nothing was stripped.
        assert capsys.readouterr().err == ""
class TestEnvLoaderSanitization:
    """Tests for _sanitize_loaded_credentials in env_loader."""

    @staticmethod
    def _sanitize(monkeypatch, name, value):
        """Set *name* to *value*, run the sanitizer, and return the result."""
        from hermes_cli.env_loader import _sanitize_loaded_credentials

        monkeypatch.setenv(name, value)
        _sanitize_loaded_credentials()
        return os.environ[name]

    def test_strips_non_ascii_from_api_key(self, monkeypatch):
        result = self._sanitize(monkeypatch, "OPENROUTER_API_KEY", "sk-proj-abc\u028bdef")
        assert result == "sk-proj-abcdef"

    def test_strips_non_ascii_from_token(self, monkeypatch):
        result = self._sanitize(monkeypatch, "DISCORD_BOT_TOKEN", "tok\u00e9nvalue")
        assert result == "toknvalue"

    def test_ignores_non_credential_vars(self, monkeypatch):
        # Not a credential suffix — the value must be left alone.
        result = self._sanitize(monkeypatch, "MY_UNICODE_VAR", "h\u00e9llo w\u00f6rld")
        assert result == "h\u00e9llo w\u00f6rld"

    def test_ascii_credentials_untouched(self, monkeypatch):
        result = self._sanitize(monkeypatch, "OPENAI_API_KEY", "sk-proj-allascii123")
        assert result == "sk-proj-allascii123"

View file

@ -142,6 +142,33 @@ class TestSurrogateVsAsciiSanitization:
assert _sanitize_messages_surrogates(messages) is False assert _sanitize_messages_surrogates(messages) is False
class TestApiKeyNonAsciiSanitization:
    """Tests for API key sanitization in the UnicodeEncodeError recovery.

    Covers the root cause of issue #6843: a non-ASCII character (ʋ U+028B)
    in the API key causes httpx to fail when encoding the Authorization
    header as ASCII. The recovery block must strip non-ASCII from the key.
    """

    def test_strip_non_ascii_from_api_key(self):
        """_strip_non_ascii removes ʋ from an API key string."""
        assert _strip_non_ascii("sk-proj-abc\u028bdef") == "sk-proj-abcdef"

    def test_api_key_at_position_153(self):
        """Reproduce the exact error: ʋ at position 153 in 'Bearer <key>'."""
        bad_key = "sk-proj-" + "a" * 138 + "\u028b" + "bcd"
        header_value = f"Bearer {bad_key}"
        # httpx encodes header values as ASCII — this is where it blows up.
        with pytest.raises(UnicodeEncodeError) as exc_info:
            header_value.encode("ascii")
        assert exc_info.value.start == 153
        # After sanitization the header encodes cleanly.
        clean_header = f"Bearer {_strip_non_ascii(bad_key)}"
        clean_header.encode("ascii")  # should not raise
class TestSanitizeToolsNonAscii: class TestSanitizeToolsNonAscii:
"""Tests for _sanitize_tools_non_ascii.""" """Tests for _sanitize_tools_non_ascii."""