Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions tests/test_log_redaction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from __future__ import annotations

from tinyagentos.log_redaction import PLACEHOLDER, redact, redact_lines


class TestKeyValue:
def test_equals_form(self):
assert redact("api_key=sk-abc123def456ghi789") == f"api_key={PLACEHOLDER}"

def test_colon_form(self):
assert redact("password: hunter2secret") == f"password: {PLACEHOLDER}"

def test_json_form(self):
out = redact('{"secret": "topsecretvalue123"}')
assert "topsecretvalue123" not in out
assert PLACEHOLDER in out

def test_flag_form(self):
assert redact("--token deadbeefcafebabe01") == f"--token {PLACEHOLDER}"

def test_case_insensitive(self):
assert redact("PASSWORD=SuperSecret99") == f"PASSWORD={PLACEHOLDER}"

def test_does_not_match_substring_key(self):
# "monkey" must not trip the "key" rule.
assert redact("monkey=banana") == "monkey=banana"

def test_preserves_surrounding_text(self):
out = redact("connecting with token=abcdef123456 to host db1")
assert out == f"connecting with token={PLACEHOLDER} to host db1"


class TestBearer:
def test_header(self):
assert redact("Authorization: Bearer abc123def456ghi") == \

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SUGGESTION: Test coverage is light for the documented threat model.

For security-critical code that is described as "nothing in the logs path may bypass this", the suite is missing negative tests that would catch the most likely regressions:

  • JWT in an Authorization header without the word Bearer (e.g. Authorization: eyJhbGciOi…).
  • Base64-shaped bearer token containing +, /, = (the exact bypass flagged on log_redaction.py:55).
  • A known secret value that appears as a substring of a longer unrelated identifier (the partial-leak case flagged on log_redaction.py:116).
  • Connection string with a percent-encoded password or with @ in the userinfo.
  • Multi-line PEM block embedded in a stack trace (the test uses a clean block, but real-world logs interleave it with exception frames).
  • redact_lines called with a list containing None or empty strings.

Adding these would lock in the intended behaviour against the regression modes that are cheapest to introduce and hardest to spot in code review.


Reply with @kilocode-bot fix it to have Kilo Code address this issue.

f"Authorization: Bearer {PLACEHOLDER}"


class TestTokenShapes:
def test_openai_style(self):
assert redact("using sk-taos-abcdefghij0123456789 now").count(PLACEHOLDER) == 1
assert "abcdefghij0123456789" not in redact("sk-taos-abcdefghij0123456789")

def test_github_pat(self):
assert PLACEHOLDER in redact("ghp_EXAMPLEEXAMPLEEXAMPLEEXAMPLE00")

def test_slack(self):
assert PLACEHOLDER in redact("xoxb-EXAMPLEEXAMPLE-EXAMPLEEXAMPLETOKEN")

def test_aws_akia(self):
assert PLACEHOLDER in redact("AKIAEXAMPLEEXAMPLE00")


class TestPem:
def test_private_key_block(self):
block = (
"-----BEGIN OPENSSH PRIVATE KEY-----\n"
"b3BlbnNzaC1rZXktdjEAAAAABG5vbmU\nAAAAAAAA\n"
"-----END OPENSSH PRIVATE KEY-----"
)
out = redact(f"key material:\n{block}\ndone")
assert "b3BlbnNz" not in out
assert out.startswith("key material:")
assert out.endswith("done")


class TestConnectionString:
def test_masks_only_password(self):
out = redact("dsn=postgres://taos:s3cr3tpw@db.internal:5432/app")
assert "s3cr3tpw" not in out
assert "postgres://taos:" in out
assert "@db.internal:5432/app" in out


class TestKnownValues:
def test_literal_value_masked(self):
out = redact("the model returned plainlookingkey987", known_values=["plainlookingkey987"])
assert "plainlookingkey987" not in out
assert PLACEHOLDER in out

def test_short_known_value_ignored(self):
# too short to safely mask -> left alone (no runaway redaction)
assert redact("abc appears here", known_values=["abc"]) == "abc appears here"

def test_empty_known_values_noop(self):
assert redact("nothing sensitive here", known_values=[]) == "nothing sensitive here"

def test_none_known_values(self):
assert redact("nothing sensitive here") == "nothing sensitive here"


class TestSafety:
def test_empty_string(self):
assert redact("") == ""

def test_clean_line_untouched(self):
line = "2026-07-02 19:00:00 INFO controller ready on port 6969"
assert redact(line) == line

def test_redact_lines(self):
out = redact_lines(["password=abcdef123456", "all good here"])
assert out[0] == f"password={PLACEHOLDER}"
assert out[1] == "all good here"


class TestFoldedFindings:
"""Negative/regression tests for the review folds (base64 bearer, JWT,
partial-leak known values, PEM framing, coercion)."""

def test_base64_bearer_masked_whole(self):
# + / = must not truncate the value (AWS/OAuth2 token shape).
out = redact("Authorization: Bearer abc123+def/456ghi789=")
assert "abc123" not in out
assert out == f"Authorization: Bearer {PLACEHOLDER}"

def test_bare_jwt_masked(self):
jwt = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.SflKxwRJSMeKKF2QT4"
out = redact(f"id_token={jwt}")

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SUGGESTION: test_bare_jwt_masked does not actually exercise the new bare-JWT branch of _TOKEN_SHAPE_RE.

The input is f"id_token={jwt}". Because id_token is in _SENSITIVE_KEYS, the key-value rule fires first and the whole JWT is consumed as the KV value — the test would still pass even if the JWT branch of _TOKEN_SHAPE_RE were deleted. The whole point of the new JWT rule is to catch tokens logged without any key= prefix (e.g. a service logging received: eyJhbGciOi…); the current test does not cover that path. Add an assertion with a bare JWT and no sensitive key prefix to lock in the new behavior.


Reply with @kilocode-bot fix it to have Kilo Code address this issue.

assert "SflKxwRJSMeKKF2QT4" not in out

def test_google_api_key(self):
assert PLACEHOLDER in redact("AIzaSyA0000000000000000000000000000000X")

def test_stripe_live_key(self):
assert PLACEHOLDER in redact("sk_live_0000000000000000abcdef")

def test_known_value_masks_whole_token(self):
# secret is a prefix of a longer id: the whole token must vanish, no
# trailing leak, no garbled placeholder.
out = redact("request_id=abcdef1234560000", known_values=["abcdef123456"])
assert "0000" not in out
assert "[REDACTED]0" not in out
assert PLACEHOLDER in out

def test_pem_keeps_framing(self):
block = (
"-----BEGIN OPENSSH PRIVATE KEY-----\n"
"b3BlbnNzaC1rZXktdjEAAAAA\n"
"-----END OPENSSH PRIVATE KEY-----"
)
out = redact(block)
assert "b3BlbnNz" not in out
assert "-----BEGIN OPENSSH PRIVATE KEY-----" in out
assert "-----END OPENSSH PRIVATE KEY-----" in out

def test_redact_lines_coerces_non_str(self):
out = redact_lines(["password=abcdef123456", None, 42])
assert out[0] == f"password={PLACEHOLDER}"
assert out[1] == "None"
assert out[2] == "42"
141 changes: 141 additions & 0 deletions tinyagentos/log_redaction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""Redaction for operator-facing log output (Logs app, bug-report bundle).

Every log line that leaves the box through the system-logs API passes through
`redact()` first. The threat is a well-meaning operator copying a log bundle
into a public GitHub issue and leaking a live credential that happened to be
logged by a dependency, a stack trace, or an env dump.

Design choices:
- Pure functions, no I/O, exhaustively tested. Nothing in the logs path may
bypass this.
- Redact by PATTERN (key=value, bearer tokens, connection strings, private-key
blocks, high-entropy provider-key shapes) AND by KNOWN SECRET VALUE (the
literal values from the secrets store, so a secret logged verbatim is caught
even if it does not match a generic shape).
- Fail closed on the value side: an empty or too-short known value is ignored
rather than redacting everything.
- Never widen a match to swallow surrounding context; replace only the secret
span with a fixed placeholder so the log stays readable.
"""
from __future__ import annotations

import re

PLACEHOLDER = "[REDACTED]"

# Keys whose value must be masked when they appear as key=value / key: value /
# "key": "value" / --key value. Case-insensitive, matched as whole words so
# "monkey" does not trip "key".
_SENSITIVE_KEYS = (
"password", "passwd", "secret", "token", "api_key", "apikey", "api-key",
"access_key", "access-key", "secret_key", "secret-key", "private_key",
"private-key", "client_secret", "client-secret", "authorization", "auth",
"bearer", "session", "cookie", "credential", "credentials", "passphrase",
"id_token", "refresh_token", "access_token", "jwt", "assertion",
)
_KEY_ALT = "|".join(sorted((re.escape(k) for k in _SENSITIVE_KEYS), key=len, reverse=True))

# key = value / key: value / "key": "value". The value runs until a quote,
# whitespace, or comma; brackets/URLs are kept in the value so a token is not
# truncated mid-string (leaving a readable tail). Capped to avoid runaway
# matches on pathological single-line input.
_KV_RE = re.compile(
r'(?P<pre>["\']?(?:' + _KEY_ALT + r')["\']?\s*[:=]\s*)'
r'(?P<quote>["\']?)(?P<val>[^\s,"\']{1,4096})(?P=quote)',
re.IGNORECASE,
)

# --key value (CLI flag form, space-separated).
_FLAG_RE = re.compile(r'(?P<pre>--(?:' + _KEY_ALT + r')\s+)(?P<val>\S+)', re.IGNORECASE)

# Auth SCHEME words that legitimately follow "authorization:"; the real secret
# is the NEXT token (handled by the bearer rule), so the KV rule must not treat
# the scheme word itself as the value and stop there, leaving the token exposed.
_AUTH_SCHEMES = {"bearer", "basic", "digest", "token", "negotiate"}

# Authorization: Bearer <token> (header form). The value class includes the
# base64/base64url trailing set (+ / =) so AWS session tokens and OAuth2
# access tokens are masked whole, not truncated at the first + or /.
_BEARER_RE = re.compile(r'(?P<pre>bearer\s+)(?P<val>[A-Za-z0-9._\-+/=]{8,})', re.IGNORECASE)
Comment on lines +51 to +59

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔒 Security & Privacy | 🟠 Major | ⚡ Quick win

Non-Bearer auth schemes leak their credential.

_kv_repl deliberately returns the match unchanged when the KV value is an auth scheme word (Lines 113-114), trusting a follow-up rule to mask the real token. But only _BEARER_RE exists, so basic, digest, token, and negotiate have no follow-up. Authorization: Basic dXNlcjpwYXNzd29yZA== therefore survives every rule and the base64 user:password is logged verbatim — a direct redaction bypass for HTTP Basic auth (and GitHub-style Authorization: token …).

Broaden the header rule to cover all schemes in _AUTH_SCHEMES rather than only bearer:

🔒 Proposed fix: mask the token after any auth scheme
-# Authorization: Bearer <token>  (header form). The value class includes the
-# base64/base64url trailing set (+ / =) so AWS session tokens and OAuth2
-# access tokens are masked whole, not truncated at the first + or /.
-_BEARER_RE = re.compile(r'(?P<pre>bearer\s+)(?P<val>[A-Za-z0-9._\-+/=]{8,})', re.IGNORECASE)
+# Authorization: <scheme> <token>  (header form). Covers every scheme in
+# _AUTH_SCHEMES so Basic/Digest/Token/Negotiate credentials are masked too,
+# not just Bearer. The value class includes the base64/base64url trailing set
+# (+ / =) so AWS session tokens and OAuth2 access tokens are masked whole.
+_AUTH_TOKEN_RE = re.compile(
+    r'(?P<pre>(?:bearer|basic|digest|token|negotiate)\s+)(?P<val>[A-Za-z0-9._\-+/=]{8,})',
+    re.IGNORECASE,
+)

And update the application site:

-    text = _BEARER_RE.sub(lambda m: m.group("pre") + PLACEHOLDER, text)
+    text = _AUTH_TOKEN_RE.sub(lambda m: m.group("pre") + PLACEHOLDER, text)
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tinyagentos/log_redaction.py` around lines 51 - 59, The Authorization
redaction only handles Bearer tokens, so schemes in _AUTH_SCHEMES like basic,
digest, token, and negotiate can still leak credentials after _kv_repl leaves
the scheme word untouched. Update _BEARER_RE in tinyagentos/log_redaction.py
(and its redaction call site if needed) so the follow-up mask covers any
Authorization scheme listed in _AUTH_SCHEMES, not just bearer, while keeping the
existing scheme-detection logic in _kv_repl aligned with that behavior.


# Bare secret shapes that carry no key= prefix. JWTs (three base64url segments)
# are common in logs with no Bearer word, so they get their own rule.
_TOKEN_SHAPE_RE = re.compile(

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: No detection for JWT-style tokens or other common bare-secret shapes.

_TOKEN_SHAPE_RE only covers sk-, gh[posru]_, xox[baprs]-, and AKIA…. It misses several categories of secrets that frequently end up in logs without a key= prefix:

  • JWTs: many services log Authorization: eyJhbGciOi… (no Bearer scheme word) or emit the token as a bare value. The KV rule may catch some of these when a sensitive key precedes them, but a JWT inside a JSON body field that is not in _SENSITIVE_KEYS (e.g. id_token, refresh_token, assertion, saml, idp_token) would slip through.
  • Stripe live keys (sk_live_…), Google API keys (AIza…), Slack legacy (xoxs- is covered but xoxa-/xoxp- may over-match short identifiers), npm tokens (npm_…), PyPI tokens (pypi-…), SendGrid (SG.…), Mailgun (key-…).

At minimum, consider adding eyJ[A-Za-z0-9_\-]{10,}\.eyJ[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,} for JWTs and an AIza[0-9A-Za-z_\-]{35} rule for Google keys, plus expanding _SENSITIVE_KEYS to include id_token, refresh_token, assertion, jwt.


Reply with @kilocode-bot fix it to have Kilo Code address this issue.

r'(?:'
r'\bsk-[A-Za-z0-9._\-]{16,}' # OpenAI / sk-taos
r'|\bsk_(?:live|test)_[0-9A-Za-z]{16,}' # Stripe
r'|\bgh[posru]_[A-Za-z0-9]{20,}' # GitHub PAT
r'|\bxox[baprs]-[A-Za-z0-9\-]{10,}' # Slack
r'|\bAKIA[0-9A-Z]{16}\b' # AWS access key id
r'|\bAIza[0-9A-Za-z_\-]{35}\b' # Google API key
r'|\bnpm_[A-Za-z0-9]{20,}' # npm token
r'|\bSG\.[A-Za-z0-9_\-]{16,}\.[A-Za-z0-9_\-]{16,}' # SendGrid

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SUGGESTION: SendGrid branch is missing \b anchors and could over-match inside a longer identifier.

Every other branch in _TOKEN_SHAPE_RE is wrapped in \b…\b (or \b… at the start when the suffix already provides a boundary). SG\.[A-Za-z0-9_\-]{16,}\.[A-Za-z0-9_\-]{16,} has no \b at either end, so a string like MYAPP_SG.aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.bbbbbbbbbbbbbbbbbbbbbbbbbbbb will be redacted even though the surrounding MYAPP_ makes it clearly not a SendGrid key. Add \b at the start (between word boundary and SG) and at the end of the regex.

Suggested change
r'|\bSG\.[A-Za-z0-9_\-]{16,}\.[A-Za-z0-9_\-]{16,}' # SendGrid
r'|\bSG\.[A-Za-z0-9_\-]{16,}\.[A-Za-z0-9_\-]{16,}\b' # SendGrid

Reply with @kilocode-bot fix it to have Kilo Code address this issue.

r'|\beyJ[A-Za-z0-9_\-]{8,}\.eyJ[A-Za-z0-9_\-]{6,}\.[A-Za-z0-9_\-]{6,}' # JWT

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SUGGESTION: JWT segment character class is missing +, /, and =, which can truncate a real JWT that uses non-strict base64.

The class [A-Za-z0-9_\-] is exactly the base64url alphabet, so a spec-compliant RFC 7519 JWT is matched in full. But a number of real-world issuers (legacy OAuth2 servers, some IdPs, hand-rolled libraries) emit JWTs that use standard base64 with +, /, and = padding in the signature segment. A token like eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxIn0.abc+def/ghi= would match the header and payload but stop at the first + in the signature, leaking the tail. The rule above already widened the Bearer class to include +/= for the same reason — apply the same reasoning here.

Suggested change
r'|\beyJ[A-Za-z0-9_\-]{8,}\.eyJ[A-Za-z0-9_\-]{6,}\.[A-Za-z0-9_\-]{6,}' # JWT
r'|\beyJ[A-Za-z0-9_\-+/=]{8,}\.eyJ[A-Za-z0-9_\-+/=]{6,}\.[A-Za-z0-9_\-+/=]{6,}' # JWT

Reply with @kilocode-bot fix it to have Kilo Code address this issue.

r')'
)

# PEM private-key blocks (SSH keys materialized on deploy, TLS keys). Keep the
# BEGIN/END markers so a key logged inside a stack trace still shows WHAT was
# redacted; only the base64 body is masked.
_PEM_RE = re.compile(
r'(?P<begin>-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----).*?(?P<end>-----END [A-Z0-9 ]*PRIVATE KEY-----)',
re.DOTALL,
)

# postgres://user:pass@host, mysql://..., redis://..., amqp:// -- mask the
# password component only.
_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]+:)(?P<val>[^@\s]+)(?P<post>@)')

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔒 Security & Privacy | 🟡 Minor | ⚡ Quick win

Connection strings with an empty username leak the password.

[^:/\s]+ requires at least one character before the :password@, so URLs of the form redis://:s3cretpass@host (empty user, common for Redis/AMQP) never match and the password survives. Make the userinfo prefix optional.

🔒 Proposed fix
-_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]+:)(?P<val>[^@\s]+)(?P<post>@)')
+_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]*:)(?P<val>[^@\s]+)(?P<post>@)')
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]+:)(?P<val>[^@\s]+)(?P<post>@)')
_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]*:)(?P<val>[^@\s]+)(?P<post>@)')
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tinyagentos/log_redaction.py` at line 87, The connection-string redaction
regex in _CONN_STR_RE only matches userinfo when a username is present, so URLs
like redis://:password@host bypass masking. Update the pattern to make the
username portion optional while still capturing the password and @ delimiter,
and verify the redaction path in log_redaction.py continues to use _CONN_STR_RE
for these cases.


# A known secret value shorter than this is not masked: too likely to be a
# common substring and cause runaway redaction of unrelated text.
_MIN_KNOWN_VALUE_LEN = 6


def redact(text: str, known_values: "list[str] | None" = None) -> str:
"""Return `text` with credential-shaped spans replaced by PLACEHOLDER.

known_values: exact secret strings (e.g. from the secrets store) to mask
wherever they appear verbatim, in addition to the pattern rules.
"""
if not text:
return text

# Structural rules first (they anchor on keys/prefixes, least likely to
# over-match), then the bare token shapes.
text = _PEM_RE.sub(lambda m: m.group("begin") + PLACEHOLDER + m.group("end"), text)
text = _CONN_STR_RE.sub(lambda m: m.group("pre") + PLACEHOLDER + m.group("post"), text)
# Bearer BEFORE the key-value rule so "authorization: Bearer <tok>" has its
# token masked; the KV rule then leaves the bare scheme word alone.
text = _BEARER_RE.sub(lambda m: m.group("pre") + PLACEHOLDER, text)
text = _FLAG_RE.sub(lambda m: m.group("pre") + PLACEHOLDER, text)

def _kv_repl(m: "re.Match[str]") -> str:
if m.group("val").lower() in _AUTH_SCHEMES:
return m.group(0) # e.g. "authorization: Bearer" -> leave for bearer rule
return m.group("pre") + PLACEHOLDER

text = _KV_RE.sub(_kv_repl, text)
text = _TOKEN_SHAPE_RE.sub(PLACEHOLDER, text)

# Known literal secret values last: mask any that survived the shape rules
# (e.g. a plain-looking API key logged without a key= prefix). Mask the
# WHOLE surrounding non-whitespace token, not just the substring, so a
# secret that is a prefix of a longer identifier (request_id=<secret>0000)
# does not leak the trailing part or produce garbled "[REDACTED]0000"
# output. Longest first so a value containing a shorter one is fully masked.
if known_values:
for val in sorted((v for v in known_values if v), key=len, reverse=True):
if len(val) < _MIN_KNOWN_VALUE_LEN:
continue
text = re.sub(r'\S*' + re.escape(val) + r'\S*', PLACEHOLDER, text)

return text


def redact_lines(lines: "list[str]", known_values: "list[str] | None" = None) -> "list[str]":
"""Redact a list of log lines (convenience for the paged log reader).

Coerces non-string elements to str so one malformed line (e.g. a None from
a buggy upstream parser) cannot take down the whole page response.
"""
return [redact(line if isinstance(line, str) else str(line), known_values) for line in lines]
Loading