mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: detect and strip non-ASCII characters from API keys (#6843)
API keys containing Unicode lookalike characters (e.g. ʋ U+028B instead of v) cause UnicodeEncodeError when httpx encodes the Authorization header as ASCII. This commonly happens when users copy-paste keys from PDFs, rich-text editors, or web pages with decorative fonts. Three layers of defense: 1. **Save-time validation** (hermes_cli/config.py): _check_non_ascii_credential() strips non-ASCII from credential values when saving to .env, with a clear warning explaining the issue. 2. **Load-time sanitization** (hermes_cli/env_loader.py): _sanitize_loaded_credentials() strips non-ASCII from credential env vars (those ending in _API_KEY, _TOKEN, _SECRET, _KEY) after dotenv loads them, so the rest of the codebase never sees non-ASCII keys. 3. **Runtime recovery** (run_agent.py): The UnicodeEncodeError recovery block now also sanitizes self.api_key and self._client_kwargs['api_key'], fixing the gap where message/tool sanitization succeeded but the API key still caused httpx to fail on the Authorization header. Also: hermes_logging.py RotatingFileHandler now explicitly sets encoding='utf-8' instead of relying on locale default (defensive hardening for ASCII-locale systems).
This commit is contained in:
parent
677f1227c3
commit
da528a8207
6 changed files with 206 additions and 0 deletions
|
|
@ -2766,6 +2766,47 @@ def sanitize_env_file() -> int:
|
||||||
return fixes
|
return fixes
|
||||||
|
|
||||||
|
|
||||||
|
def _check_non_ascii_credential(key: str, value: str) -> str:
|
||||||
|
"""Warn and strip non-ASCII characters from credential values.
|
||||||
|
|
||||||
|
API keys and tokens must be pure ASCII — they are sent as HTTP header
|
||||||
|
values which httpx/httpcore encode as ASCII. Non-ASCII characters
|
||||||
|
(commonly introduced by copy-pasting from rich-text editors or PDFs
|
||||||
|
that substitute lookalike Unicode glyphs for ASCII letters) cause
|
||||||
|
``UnicodeEncodeError: 'ascii' codec can't encode character`` at
|
||||||
|
request time.
|
||||||
|
|
||||||
|
Returns the sanitized (ASCII-only) value. Prints a warning if any
|
||||||
|
non-ASCII characters were found and removed.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
value.encode("ascii")
|
||||||
|
return value # all ASCII — nothing to do
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Build a readable list of the offending characters
|
||||||
|
bad_chars: list[str] = []
|
||||||
|
for i, ch in enumerate(value):
|
||||||
|
if ord(ch) > 127:
|
||||||
|
bad_chars.append(f" position {i}: {ch!r} (U+{ord(ch):04X})")
|
||||||
|
sanitized = value.encode("ascii", errors="ignore").decode("ascii")
|
||||||
|
|
||||||
|
import sys
|
||||||
|
print(
|
||||||
|
f"\n Warning: {key} contains non-ASCII characters that will break API requests.\n"
|
||||||
|
f" This usually happens when copy-pasting from a PDF, rich-text editor,\n"
|
||||||
|
f" or web page that substitutes lookalike Unicode glyphs for ASCII letters.\n"
|
||||||
|
f"\n"
|
||||||
|
+ "\n".join(f" {line}" for line in bad_chars[:5])
|
||||||
|
+ ("\n ... and more" if len(bad_chars) > 5 else "")
|
||||||
|
+ f"\n\n The non-ASCII characters have been stripped automatically.\n"
|
||||||
|
f" If authentication fails, re-copy the key from the provider's dashboard.\n",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
return sanitized
|
||||||
|
|
||||||
|
|
||||||
def save_env_value(key: str, value: str):
|
def save_env_value(key: str, value: str):
|
||||||
"""Save or update a value in ~/.hermes/.env."""
|
"""Save or update a value in ~/.hermes/.env."""
|
||||||
if is_managed():
|
if is_managed():
|
||||||
|
|
@ -2774,6 +2815,8 @@ def save_env_value(key: str, value: str):
|
||||||
if not _ENV_VAR_NAME_RE.match(key):
|
if not _ENV_VAR_NAME_RE.match(key):
|
||||||
raise ValueError(f"Invalid environment variable name: {key!r}")
|
raise ValueError(f"Invalid environment variable name: {key!r}")
|
||||||
value = value.replace("\n", "").replace("\r", "")
|
value = value.replace("\n", "").replace("\r", "")
|
||||||
|
# API keys / tokens must be ASCII — strip non-ASCII with a warning.
|
||||||
|
value = _check_non_ascii_credential(key, value)
|
||||||
ensure_hermes_home()
|
ensure_hermes_home()
|
||||||
env_path = get_env_path()
|
env_path = get_env_path()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,40 @@ from pathlib import Path
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
|
||||||
|
# Env var name suffixes that indicate credential values. These are the
|
||||||
|
# only env vars whose values we sanitize on load — we must not silently
|
||||||
|
# alter arbitrary user env vars, but credentials are known to require
|
||||||
|
# pure ASCII (they become HTTP header values).
|
||||||
|
_CREDENTIAL_SUFFIXES = ("_API_KEY", "_TOKEN", "_SECRET", "_KEY")
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_loaded_credentials() -> None:
|
||||||
|
"""Strip non-ASCII characters from credential env vars in os.environ.
|
||||||
|
|
||||||
|
Called after dotenv loads so the rest of the codebase never sees
|
||||||
|
non-ASCII API keys. Only touches env vars whose names end with
|
||||||
|
known credential suffixes (``_API_KEY``, ``_TOKEN``, etc.).
|
||||||
|
"""
|
||||||
|
for key, value in list(os.environ.items()):
|
||||||
|
if not any(key.endswith(suffix) for suffix in _CREDENTIAL_SUFFIXES):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
value.encode("ascii")
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
os.environ[key] = value.encode("ascii", errors="ignore").decode("ascii")
|
||||||
|
|
||||||
|
|
||||||
def _load_dotenv_with_fallback(path: Path, *, override: bool) -> None:
    """Load a dotenv file, retrying with latin-1 if it is not valid UTF-8.

    latin-1 maps every byte to a code point, so the fallback load cannot
    itself raise UnicodeDecodeError. After loading, credential env vars
    are scrubbed of non-ASCII characters.
    """
    try:
        load_dotenv(dotenv_path=path, override=override, encoding="utf-8")
    except UnicodeDecodeError:
        # File is not valid UTF-8 — retry with a lossless single-byte decoding.
        load_dotenv(dotenv_path=path, override=override, encoding="latin-1")

    # Strip non-ASCII characters from credential env vars that were just
    # loaded. API keys must be pure ASCII since they're sent as HTTP
    # header values (httpx encodes headers as ASCII). Non-ASCII chars
    # typically come from copy-pasting keys from PDFs or rich-text editors
    # that substitute Unicode lookalike glyphs (e.g. ʋ U+028B for v).
    _sanitize_loaded_credentials()
|
||||||
|
|
||||||
|
|
||||||
def _sanitize_env_file_if_needed(path: Path) -> None:
|
def _sanitize_env_file_if_needed(path: Path) -> None:
|
||||||
|
|
|
||||||
|
|
@ -358,6 +358,7 @@ def _add_rotating_handler(
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
handler = _ManagedRotatingFileHandler(
|
handler = _ManagedRotatingFileHandler(
|
||||||
str(path), maxBytes=max_bytes, backupCount=backup_count,
|
str(path), maxBytes=max_bytes, backupCount=backup_count,
|
||||||
|
encoding="utf-8",
|
||||||
)
|
)
|
||||||
handler.setLevel(level)
|
handler.setLevel(level)
|
||||||
handler.setFormatter(formatter)
|
handler.setFormatter(formatter)
|
||||||
|
|
|
||||||
23
run_agent.py
23
run_agent.py
|
|
@ -8987,12 +8987,35 @@ class AIAgent:
|
||||||
if isinstance(_default_headers, dict):
|
if isinstance(_default_headers, dict):
|
||||||
_headers_sanitized = _sanitize_structure_non_ascii(_default_headers)
|
_headers_sanitized = _sanitize_structure_non_ascii(_default_headers)
|
||||||
|
|
||||||
|
# Sanitize the API key — non-ASCII characters in
|
||||||
|
# credentials (e.g. ʋ instead of v from a bad
|
||||||
|
# copy-paste) cause httpx to fail when encoding
|
||||||
|
# the Authorization header as ASCII. This is the
|
||||||
|
# most common cause of persistent UnicodeEncodeError
|
||||||
|
# that survives message/tool sanitization (#6843).
|
||||||
|
_credential_sanitized = False
|
||||||
|
_raw_key = getattr(self, "api_key", None) or ""
|
||||||
|
if _raw_key:
|
||||||
|
_clean_key = _strip_non_ascii(_raw_key)
|
||||||
|
if _clean_key != _raw_key:
|
||||||
|
self.api_key = _clean_key
|
||||||
|
if isinstance(getattr(self, "_client_kwargs", None), dict):
|
||||||
|
self._client_kwargs["api_key"] = _clean_key
|
||||||
|
_credential_sanitized = True
|
||||||
|
self._vprint(
|
||||||
|
f"{self.log_prefix}⚠️ API key contained non-ASCII characters "
|
||||||
|
f"(bad copy-paste?) — stripped them. If auth fails, "
|
||||||
|
f"re-copy the key from your provider's dashboard.",
|
||||||
|
force=True,
|
||||||
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
_messages_sanitized
|
_messages_sanitized
|
||||||
or _prefill_sanitized
|
or _prefill_sanitized
|
||||||
or _tools_sanitized
|
or _tools_sanitized
|
||||||
or _system_sanitized
|
or _system_sanitized
|
||||||
or _headers_sanitized
|
or _headers_sanitized
|
||||||
|
or _credential_sanitized
|
||||||
):
|
):
|
||||||
self._unicode_sanitization_passes += 1
|
self._unicode_sanitization_passes += 1
|
||||||
self._vprint(
|
self._vprint(
|
||||||
|
|
|
||||||
83
tests/hermes_cli/test_non_ascii_credential.py
Normal file
83
tests/hermes_cli/test_non_ascii_credential.py
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
"""Tests for non-ASCII credential detection and sanitization.
|
||||||
|
|
||||||
|
Covers the fix for issue #6843 — API keys containing Unicode lookalike
|
||||||
|
characters (e.g. ʋ U+028B instead of v) cause UnicodeEncodeError when
|
||||||
|
httpx tries to encode the Authorization header as ASCII.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from hermes_cli.config import _check_non_ascii_credential
|
||||||
|
|
||||||
|
|
||||||
|
class TestCheckNonAsciiCredential:
    """Tests for _check_non_ascii_credential()."""

    def test_ascii_key_unchanged(self):
        ascii_key = "sk-proj-" + "a" * 100
        assert _check_non_ascii_credential("TEST_API_KEY", ascii_key) == ascii_key

    def test_strips_unicode_v_lookalike(self, capsys):
        """The exact scenario from issue #6843: ʋ instead of v."""
        tainted = "sk-proj-abc" + "\u028b" + "def"
        cleaned = _check_non_ascii_credential("OPENROUTER_API_KEY", tainted)
        assert cleaned == "sk-proj-abcdef"
        assert "\u028b" not in cleaned
        # A warning must have been emitted.
        assert "non-ASCII" in capsys.readouterr().err

    def test_strips_multiple_non_ascii(self, capsys):
        tainted = "sk-proj-a\u028bb\u00e9cd"
        assert _check_non_ascii_credential("OPENAI_API_KEY", tainted) == "sk-proj-abcd"
        # The warning lists the offending code point.
        assert "U+028B" in capsys.readouterr().err

    def test_empty_key(self):
        assert _check_non_ascii_credential("TEST_KEY", "") == ""

    def test_all_ascii_no_warning(self, capsys):
        value = "all-ascii-value-123"
        assert _check_non_ascii_credential("KEY", value) == value
        assert capsys.readouterr().err == ""
|
||||||
|
|
||||||
|
|
||||||
|
class TestEnvLoaderSanitization:
    """Tests for _sanitize_loaded_credentials in env_loader."""

    def test_strips_non_ascii_from_api_key(self, monkeypatch):
        from hermes_cli.env_loader import _sanitize_loaded_credentials

        monkeypatch.setenv("OPENROUTER_API_KEY", "sk-proj-abc\u028bdef")
        _sanitize_loaded_credentials()
        assert os.environ["OPENROUTER_API_KEY"] == "sk-proj-abcdef"

    def test_strips_non_ascii_from_token(self, monkeypatch):
        from hermes_cli.env_loader import _sanitize_loaded_credentials

        monkeypatch.setenv("DISCORD_BOT_TOKEN", "tok\u00e9nvalue")
        _sanitize_loaded_credentials()
        assert os.environ["DISCORD_BOT_TOKEN"] == "toknvalue"

    def test_ignores_non_credential_vars(self, monkeypatch):
        from hermes_cli.env_loader import _sanitize_loaded_credentials

        monkeypatch.setenv("MY_UNICODE_VAR", "h\u00e9llo w\u00f6rld")
        _sanitize_loaded_credentials()
        # Name lacks a credential suffix — value must survive untouched.
        assert os.environ["MY_UNICODE_VAR"] == "h\u00e9llo w\u00f6rld"

    def test_ascii_credentials_untouched(self, monkeypatch):
        from hermes_cli.env_loader import _sanitize_loaded_credentials

        monkeypatch.setenv("OPENAI_API_KEY", "sk-proj-allascii123")
        _sanitize_loaded_credentials()
        assert os.environ["OPENAI_API_KEY"] == "sk-proj-allascii123"
|
||||||
|
|
@ -142,6 +142,33 @@ class TestSurrogateVsAsciiSanitization:
|
||||||
assert _sanitize_messages_surrogates(messages) is False
|
assert _sanitize_messages_surrogates(messages) is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestApiKeyNonAsciiSanitization:
    """Tests for API key sanitization in the UnicodeEncodeError recovery.

    Covers the root cause of issue #6843: a non-ASCII character (ʋ U+028B)
    in the API key causes httpx to fail when encoding the Authorization
    header as ASCII. The recovery block must strip non-ASCII from the key.
    """

    def test_strip_non_ascii_from_api_key(self):
        """_strip_non_ascii removes ʋ from an API key string."""
        assert _strip_non_ascii("sk-proj-abc" + "\u028b" + "def") == "sk-proj-abcdef"

    def test_api_key_at_position_153(self):
        """Reproduce the exact error: ʋ at position 153 in 'Bearer <key>'."""
        bad_key = "sk-proj-" + "a" * 138 + "\u028b" + "bcd"
        header_value = f"Bearer {bad_key}"
        # httpx encodes the Authorization header as ASCII — this raises:
        with pytest.raises(UnicodeEncodeError) as exc_info:
            header_value.encode("ascii")
        assert exc_info.value.start == 153
        # After stripping, the header encodes cleanly (must not raise).
        clean_header = f"Bearer {_strip_non_ascii(bad_key)}"
        clean_header.encode("ascii")
|
||||||
|
|
||||||
|
|
||||||
class TestSanitizeToolsNonAscii:
|
class TestSanitizeToolsNonAscii:
|
||||||
"""Tests for _sanitize_tools_non_ascii."""
|
"""Tests for _sanitize_tools_non_ascii."""
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue