diff --git a/agent/redact.py b/agent/redact.py
index 43fe046b4de..c69003fcf66 100644
--- a/agent/redact.py
+++ b/agent/redact.py
@@ -222,6 +222,28 @@ _DB_CONNSTR_RE = re.compile(
     re.IGNORECASE,
 )
 
+# Bare-token credential in a web/transport URL: ``scheme://TOKEN@host``.
+# This is the ``git remote set-url origin https://PASSWORD@github.com/...``
+# shape from issue #6396 — a single opaque credential in the userinfo position
+# with NO ``user:pass`` colon. It is unambiguously a secret: legitimate
+# round-trip URLs (OAuth callbacks, magic links, pre-signed shares — see the
+# "Web-URL redaction is intentionally OFF" note in redact_sensitive_text) carry
+# their tokens in the QUERY STRING, never in bare userinfo. The colon form
+# ``user:pass@`` is deliberately left to pass through (commit "pass web URLs
+# through unchanged", #34029) and is NOT matched here — the token class forbids
+# ``:``. DB schemes are handled by _DB_CONNSTR_RE above and excluded here.
+#
+# Guards against false positives:
+#   - 8+ char floor skips short usernames (git, admin, root, deploy, ubuntu).
+#   - The token class ``[^\s:@/?#]`` stops at ``/``, ``?`` and ``#`` (the RFC 3986
+#     authority terminators), so an ``@`` in a path, query or fragment — even
+#     right after a path-less host (``host?q=a@b.com``) — is never userinfo.
+_URL_BARE_TOKEN_RE = re.compile(
+    r"((?:https?|wss?|git|ssh|ftp|ftps|sftp)://)"  # scheme
+    r"([^\s:@/?#]{8,})"                            # bare token (no : / @ ? #), 8+ chars
+    r"(@[^\s]+)",                                  # @host...
+    re.IGNORECASE,
+)
+
 # JWT tokens: header.payload[.signature] — always start with "eyJ" (base64 for "{")
 # Matches 1-part (header only), 2-part (header.payload), and full 3-part JWTs.
 _JWT_RE = re.compile(
@@ -564,6 +586,16 @@ def redact_sensitive_text(
         else:
             text = _DB_CONNSTR_RE.sub(lambda m: f"{m.group(1)}***{m.group(3)}", text)
 
+        # Bare-token userinfo in web/transport URLs: ``scheme://TOKEN@host``.
+        # The git-remote-with-embedded-password shape from #6396. Only the
+        # colon-less bare-token form is redacted — ``user:pass@`` and
+        # query-string tokens are left to pass through (see the web-URL note
+        # below). See _URL_BARE_TOKEN_RE for the false-positive guards.
+        text = _URL_BARE_TOKEN_RE.sub(
+            lambda m: f"{m.group(1)}{_mask_token(m.group(2))}{m.group(3)}",
+            text,
+        )
+
     # JWT tokens (eyJ... — base64-encoded JSON headers)
     if "eyJ" in text:
         text = _JWT_RE.sub(lambda m: _mask_token(m.group(0)), text)
@@ -575,7 +607,12 @@ def redact_sensitive_text(
     # blanket-redacting param values by name breaks those skills mid-flow.
     # Known credential shapes (sk-, ghp_, JWTs, etc.) inside URLs are still
     # caught by _PREFIX_RE and _JWT_RE above. DB connection-string passwords
-    # are still caught by _DB_CONNSTR_RE.
+    # are still caught by _DB_CONNSTR_RE. The ONE userinfo case still redacted
+    # is the colon-less bare-token form ``scheme://TOKEN@host`` (#6396, handled
+    # by _URL_BARE_TOKEN_RE in the ``://`` block above): a bare credential in
+    # userinfo is never a round-trip workflow token (those live in the query
+    # string), so masking it can't break a skill. The ``user:pass@`` form is
+    # left to pass through per #34029.
 
     # Form-urlencoded bodies (only triggers on clean k=v&k=v inputs).
     if "&" in text and "=" in text:
diff --git a/tests/agent/test_redact.py b/tests/agent/test_redact.py
index c28717e5a3a..afa841cb00c 100644
--- a/tests/agent/test_redact.py
+++ b/tests/agent/test_redact.py
@@ -492,6 +492,88 @@ class TestWebUrlsNotRedacted:
         assert "dbpass" not in result
 
 
+class TestBareTokenUserinfoRedaction:
+    """Regression tests for #6396 — a bare credential in URL userinfo
+    (``scheme://TOKEN@host``, no ``user:pass`` colon) is redacted. This is the
+    git-remote-with-embedded-password shape. The colon form ``user:pass@`` and
+    query-string tokens are deliberately left to pass through (#34029) so
+    magic-link / OAuth round-trip skills keep working — see
+    TestWebUrlsNotRedacted for those invariants.
+    """
+
+    def test_git_remote_bare_password_redacted(self):
+        """Exact bug scenario: password in a git remote URL."""
+        text = (
+            "git remote set-url origin "
+            "https://MYPASSWORDWASDISLAYEDHERE@github.com/unclehowell/FCUK.git"
+        )
+        result = redact_sensitive_text(text)
+        assert "MYPASSWORDWASDISLAYEDHERE" not in result
+        assert "@github.com" in result
+        assert "unclehowell/FCUK.git" in result
+
+    def test_ssh_bare_token_redacted(self):
+        text = "ssh://longtoken1234567@gitlab.com/project.git"
+        result = redact_sensitive_text(text)
+        assert "longtoken1234567" not in result
+        assert "@gitlab.com" in result
+
+    def test_ftp_bare_token_redacted(self):
+        text = "ftp://ftptoken123456@ftp.example.com/files"
+        result = redact_sensitive_text(text)
+        assert "ftptoken123456" not in result
+
+    def test_bare_token_with_query_redacts_token_only(self):
+        text = "https://abcdef1234567@host.com/path?foo=bar"
+        result = redact_sensitive_text(text)
+        assert "abcdef1234567" not in result
+        assert "?foo=bar" in result
+
+    def test_user_pass_form_still_passes_through(self):
+        """The ``user:pass@`` colon form must NOT be redacted (#34029)."""
+        text = "URL: https://user:supersecretpw@host.example.com/path"
+        assert redact_sensitive_text(text) == text
+
+    def test_short_username_not_redacted(self):
+        """Short userinfo (git, admin, deploy) below the 8-char floor passes."""
+        for text in (
+            "https://git@github.com/user/repo.git",
+            "https://admin@example.com/x",
+            "https://deploy@host.com/y",
+        ):
+            assert redact_sensitive_text(text) == text
+
+    def test_email_in_path_not_redacted(self):
+        """An ``@`` in a path/query is not userinfo — the token class stops at
+        ``/``, so emails after the first slash are never treated as a credential."""
+        for text in (
+            "https://example.com/search?q=user@example.com",
+            "https://example.com/users/john@doe.com/profile",
+        ):
+            assert redact_sensitive_text(text) == text
+
+    def test_email_after_pathless_host_not_redacted(self):
+        """The authority also ends at ``?`` and ``#`` (RFC 3986), so an ``@`` in
+        a query/fragment directly after a path-less host is not userinfo."""
+        for text in (
+            "https://example.com?q=user@example.com",
+            "https://example.com#contact@example.com",
+        ):
+            assert redact_sensitive_text(text) == text
+
+    def test_plain_url_unchanged(self):
+        text = "https://github.com/user/repo.git"
+        assert redact_sensitive_text(text) == text
+
+    def test_long_bare_token_preserves_head_tail(self):
+        token = "abcdef" + "x" * 20 + "wxyz"
+        text = f"https://{token}@github.com/u/r.git"
+        result = redact_sensitive_text(text)
+        assert token not in result
+        assert "abcdef" in result  # head preserved
+        assert "wxyz" in result  # tail preserved
+
+
 class TestFormBodyRedaction:
     """Form-urlencoded body redaction (k=v&k=v with no other text)."""
 