Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions tests/test_log_redaction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
from __future__ import annotations

from tinyagentos.log_redaction import PLACEHOLDER, redact, redact_lines


class TestKeyValue:
def test_equals_form(self):
assert redact("api_key=sk-abc123def456ghi789") == f"api_key={PLACEHOLDER}"

def test_colon_form(self):
assert redact("password: hunter2secret") == f"password: {PLACEHOLDER}"

def test_json_form(self):
out = redact('{"secret": "topsecretvalue123"}')
assert "topsecretvalue123" not in out
assert PLACEHOLDER in out

def test_flag_form(self):
assert redact("--token deadbeefcafebabe01") == f"--token {PLACEHOLDER}"

def test_case_insensitive(self):
assert redact("PASSWORD=SuperSecret99") == f"PASSWORD={PLACEHOLDER}"

def test_does_not_match_substring_key(self):
# "monkey" must not trip the "key" rule.
assert redact("monkey=banana") == "monkey=banana"

def test_preserves_surrounding_text(self):
out = redact("connecting with token=abcdef123456 to host db1")
assert out == f"connecting with token={PLACEHOLDER} to host db1"


class TestBearer:
def test_header(self):
assert redact("Authorization: Bearer abc123def456ghi") == \

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SUGGESTION: Test coverage is light for the documented threat model.

For security-critical code that is described as "nothing in the logs path may bypass this", the suite is missing negative tests that would catch the most likely regressions:

  • JWT in an Authorization header without the word Bearer (e.g. Authorization: eyJhbGciOi…).
  • Base64-shaped bearer token containing +, /, = (the exact bypass flagged on log_redaction.py:55).
  • A known secret value that appears as a substring of a longer unrelated identifier (the partial-leak case flagged on log_redaction.py:116).
  • Connection string with a percent-encoded password or with @ in the userinfo.
  • Multi-line PEM block embedded in a stack trace (the test uses a clean block, but real-world logs interleave it with exception frames).
  • redact_lines called with a list containing None or empty strings.

Adding these would lock in the intended behaviour against the regression modes that are cheapest to introduce and hardest to spot in code review.


Reply with @kilocode-bot fix it to have Kilo Code address this issue.

f"Authorization: Bearer {PLACEHOLDER}"


class TestTokenShapes:
def test_openai_style(self):
assert redact("using sk-taos-abcdefghij0123456789 now").count(PLACEHOLDER) == 1
assert "abcdefghij0123456789" not in redact("sk-taos-abcdefghij0123456789")

def test_github_pat(self):
assert PLACEHOLDER in redact("ghp_EXAMPLEEXAMPLEEXAMPLEEXAMPLE00")

def test_slack(self):
assert PLACEHOLDER in redact("xoxb-EXAMPLEEXAMPLE-EXAMPLEEXAMPLETOKEN")

def test_aws_akia(self):
assert PLACEHOLDER in redact("AKIAEXAMPLEEXAMPLE00")


class TestPem:
def test_private_key_block(self):
block = (
"-----BEGIN OPENSSH PRIVATE KEY-----\n"
"b3BlbnNzaC1rZXktdjEAAAAABG5vbmU\nAAAAAAAA\n"
"-----END OPENSSH PRIVATE KEY-----"
)
out = redact(f"key material:\n{block}\ndone")
assert "b3BlbnNz" not in out
assert out.startswith("key material:")
assert out.endswith("done")


class TestConnectionString:
def test_masks_only_password(self):
out = redact("dsn=postgres://taos:s3cr3tpw@db.internal:5432/app")
assert "s3cr3tpw" not in out
assert "postgres://taos:" in out
assert "@db.internal:5432/app" in out


class TestKnownValues:
def test_literal_value_masked(self):
out = redact("the model returned plainlookingkey987", known_values=["plainlookingkey987"])
assert "plainlookingkey987" not in out
assert PLACEHOLDER in out

def test_short_known_value_ignored(self):
# too short to safely mask -> left alone (no runaway redaction)
assert redact("abc appears here", known_values=["abc"]) == "abc appears here"

def test_empty_known_values_noop(self):
assert redact("nothing sensitive here", known_values=[]) == "nothing sensitive here"

def test_none_known_values(self):
assert redact("nothing sensitive here") == "nothing sensitive here"


class TestSafety:
def test_empty_string(self):
assert redact("") == ""

def test_clean_line_untouched(self):
line = "2026-07-02 19:00:00 INFO controller ready on port 6969"
assert redact(line) == line

def test_redact_lines(self):
out = redact_lines(["password=abcdef123456", "all good here"])
assert out[0] == f"password={PLACEHOLDER}"
assert out[1] == "all good here"
123 changes: 123 additions & 0 deletions tinyagentos/log_redaction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Redaction for operator-facing log output (Logs app, bug-report bundle).

Every log line that leaves the box through the system-logs API passes through
`redact()` first. The threat is a well-meaning operator copying a log bundle
into a public GitHub issue and leaking a live credential that happened to be
logged by a dependency, a stack trace, or an env dump.

Design choices:
- Pure functions, no I/O, exhaustively tested. Nothing in the logs path may
bypass this.
- Redact by PATTERN (key=value, bearer tokens, connection strings, private-key
blocks, high-entropy provider-key shapes) AND by KNOWN SECRET VALUE (the
literal values from the secrets store, so a secret logged verbatim is caught
even if it does not match a generic shape).
- Fail closed on the value side: an empty or too-short known value is ignored
rather than redacting everything.
- Never widen a match to swallow surrounding context; replace only the secret
span with a fixed placeholder so the log stays readable.
"""
from __future__ import annotations

import re

PLACEHOLDER = "[REDACTED]"

# Keys whose value must be masked when they appear as key=value / key: value /
# "key": "value" / --key value. Case-insensitive, matched as whole words so
# "monkey" does not trip "key".
_SENSITIVE_KEYS = (
"password", "passwd", "secret", "token", "api_key", "apikey", "api-key",
"access_key", "access-key", "secret_key", "secret-key", "private_key",
"private-key", "client_secret", "client-secret", "authorization", "auth",
"bearer", "session", "cookie", "credential", "credentials", "passphrase",
)
_KEY_ALT = "|".join(sorted((re.escape(k) for k in _SENSITIVE_KEYS), key=len, reverse=True))

# key = value / key: value / "key": "value" (value ends at quote,
# whitespace, comma, or line end).
_KV_RE = re.compile(
r'(?P<pre>["\']?(?:' + _KEY_ALT + r')["\']?\s*[:=]\s*)'
r'(?P<quote>["\']?)(?P<val>[^\s,"\'}{]+)(?P=quote)',

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SUGGESTION: _KV_RE value character class is too narrow for some legitimate secret formats.

[^\s,"'}]{]+ stops at <, >, ), ], and ]. A value such as password=<redacted-by-caller> (e.g. an HTML-escaped or shell-quoted token), a URL like redirect_uri=https://app.example.com/cb?x=1&y=2, or anything containing brackets will be truncated mid-value. While the rule itself isn't a security bypass (the start of the secret still gets masked), it means a non-empty tail of the secret can be left in the log.

Consider broadening to [^\s"'}{]+ (or even \S+) and trusting the closing quote/comma heuristics via the existing (?P=quote) group, with a length cap to avoid runaway matches on pathological input.


Reply with @kilocode-bot fix it to have Kilo Code address this issue.

re.IGNORECASE,
)

# --key value (CLI flag form, space-separated).
_FLAG_RE = re.compile(r'(?P<pre>--(?:' + _KEY_ALT + r')\s+)(?P<val>\S+)', re.IGNORECASE)

# Auth SCHEME words that legitimately follow "authorization:"; the real secret
# is the NEXT token (handled by the bearer rule), so the KV rule must not treat
# the scheme word itself as the value and stop there, leaving the token exposed.
_AUTH_SCHEMES = {"bearer", "basic", "digest", "token", "negotiate"}

# Authorization: Bearer <token> (header form; the key-value rule catches the
# "authorization=" form, this catches the header " Bearer <tok>" shape).
_BEARER_RE = re.compile(r'(?P<pre>bearer\s+)(?P<val>[A-Za-z0-9._\-]{8,})', re.IGNORECASE)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CRITICAL: Bearer-token value class misses characters that commonly appear in real tokens.

The value character class is [A-Za-z0-9._\-]{8,}. It does not allow +, /, or =, which are the trailing three characters of standard base64/base64url encodings. A Bearer header like Authorization: Bearer abc123def+456ghi= would split at the first illegal character: the value class would only see abc123def (and, depending on its length, may not even satisfy {8,}), so the token would either be partially masked or not masked at all.

This is exactly the shape used by AWS session tokens, many OAuth2 access tokens, and other application/x-www-form-urlencoded-friendly credentials. A bypass here defeats the purpose of the header form.

Suggested change
_BEARER_RE = re.compile(r'(?P<pre>bearer\s+)(?P<val>[A-Za-z0-9._\-]{8,})', re.IGNORECASE)
_BEARER_RE = re.compile(r'(?P<pre>bearer\s+)(?P<val>[A-Za-z0-9._\-+/=]{8,})', re.IGNORECASE)

Reply with @kilocode-bot fix it to have Kilo Code address this issue.


# Provider-key shapes that are secrets on their own with no key= prefix:
# sk-..., sk-taos-..., ghp_/gho_/ghs_ (GitHub), xoxb-/xoxp- (Slack), AKIA... (AWS).
_TOKEN_SHAPE_RE = re.compile(

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: No detection for JWT-style tokens or other common bare-secret shapes.

_TOKEN_SHAPE_RE only covers sk-, gh[posru]_, xox[baprs]-, and AKIA…. It misses several categories of secrets that frequently end up in logs without a key= prefix:

  • JWTs: many services log Authorization: eyJhbGciOi… (no Bearer scheme word) or emit the token as a bare value. The KV rule may catch some of these when a sensitive key precedes them, but a JWT inside a JSON body field that is not in _SENSITIVE_KEYS (e.g. id_token, refresh_token, assertion, saml, idp_token) would slip through.
  • Stripe live keys (sk_live_…), Google API keys (AIza…), Slack legacy (xoxs- is covered but xoxa-/xoxp- may over-match short identifiers), npm tokens (npm_…), PyPI tokens (pypi-…), SendGrid (SG.…), Mailgun (key-…).

At minimum, consider adding eyJ[A-Za-z0-9_\-]{10,}\.eyJ[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,} for JWTs and an AIza[0-9A-Za-z_\-]{35} rule for Google keys, plus expanding _SENSITIVE_KEYS to include id_token, refresh_token, assertion, jwt.


Reply with @kilocode-bot fix it to have Kilo Code address this issue.

r'\b(?:'
r'sk-[A-Za-z0-9._\-]{16,}'
r'|gh[posru]_[A-Za-z0-9]{20,}'
r'|xox[baprs]-[A-Za-z0-9\-]{10,}'
r'|AKIA[0-9A-Z]{16}'
r')\b'
)

# PEM private-key blocks (SSH keys materialized on deploy, TLS keys).
_PEM_RE = re.compile(
r'-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----.*?-----END [A-Z0-9 ]*PRIVATE KEY-----',

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SUGGESTION: PEM rule swallows surrounding log context, not just the key block.

_PEM_RE is replaced with a single PLACEHOLDER, but the rule's match is the entire span from -----BEGIN … to -----END … (with .*? and DOTALL). That is correct for masking, but the loss is greater than necessary when a stack trace logs the key material inline — the exception type, file, line number, and the message framing around the key are all gone. Consider preserving the framing markers and masking only the base64 body, e.g. replace -----BEGIN …-----\n.*?-----END …----- so -----BEGIN OPENSSH PRIVATE KEY----- and -----END …----- remain visible and only the bytes between them are replaced.


Reply with @kilocode-bot fix it to have Kilo Code address this issue.

re.DOTALL,
)

# postgres://user:pass@host, mysql://..., redis://..., amqp:// -- mask the
# password component only.
_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]+:)(?P<val>[^@\s]+)(?P<post>@)')

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔒 Security & Privacy | 🟡 Minor | ⚡ Quick win

Connection strings with an empty username leak the password.

[^:/\s]+ requires at least one character before the :password@, so URLs of the form redis://:s3cretpass@host (empty user, common for Redis/AMQP) never match and the password survives. Make the userinfo prefix optional.

🔒 Proposed fix
-_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]+:)(?P<val>[^@\s]+)(?P<post>@)')
+_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]*:)(?P<val>[^@\s]+)(?P<post>@)')
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]+:)(?P<val>[^@\s]+)(?P<post>@)')
_CONN_STR_RE = re.compile(r'(?P<pre>[a-zA-Z][a-zA-Z0-9+.\-]*://[^:/\s]*:)(?P<val>[^@\s]+)(?P<post>@)')
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tinyagentos/log_redaction.py` at line 87, The connection-string redaction
regex in _CONN_STR_RE only matches userinfo when a username is present, so URLs
like redis://:password@host bypass masking. Update the pattern to make the
username portion optional while still capturing the password and @ delimiter,
and verify the redaction path in log_redaction.py continues to use _CONN_STR_RE
for these cases.


# A known secret value shorter than this is not masked: too likely to be a
# common substring and cause runaway redaction of unrelated text.
_MIN_KNOWN_VALUE_LEN = 6


def redact(text: str, known_values: "list[str] | None" = None) -> str:
"""Return `text` with credential-shaped spans replaced by PLACEHOLDER.

known_values: exact secret strings (e.g. from the secrets store) to mask
wherever they appear verbatim, in addition to the pattern rules.
"""
if not text:
return text

# Structural rules first (they anchor on keys/prefixes, least likely to
# over-match), then the bare token shapes.
text = _PEM_RE.sub(PLACEHOLDER, text)
text = _CONN_STR_RE.sub(lambda m: m.group("pre") + PLACEHOLDER + m.group("post"), text)
# Bearer BEFORE the key-value rule so "authorization: Bearer <tok>" has its
# token masked; the KV rule then leaves the bare scheme word alone.
text = _BEARER_RE.sub(lambda m: m.group("pre") + PLACEHOLDER, text)
text = _FLAG_RE.sub(lambda m: m.group("pre") + PLACEHOLDER, text)

def _kv_repl(m: "re.Match[str]") -> str:
if m.group("val").lower() in _AUTH_SCHEMES:
return m.group(0) # e.g. "authorization: Bearer" -> leave for bearer rule
return m.group("pre") + PLACEHOLDER

text = _KV_RE.sub(_kv_repl, text)
text = _TOKEN_SHAPE_RE.sub(PLACEHOLDER, text)

# Known literal secret values last: mask any that survived the shape rules
# (e.g. a plain-looking API key logged without a key= prefix). Longest
# first so a value that contains a shorter one is fully masked.
if known_values:
for val in sorted((v for v in known_values if v), key=len, reverse=True):
if len(val) < _MIN_KNOWN_VALUE_LEN:
continue
text = text.replace(val, PLACEHOLDER)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: known_values are replaced as raw substrings, which can corrupt unrelated text and cause partial leaks.

text.replace(val, PLACEHOLDER) will mask the secret wherever it appears, including inside longer unrelated strings. Two concrete risks:

  1. Partial-leak context. If the known secret is abcdef123456 and a log line contains request_id=abcdef1234567890, the result becomes request_id=[REDACTED]7890, leaking both a 6-char prefix (which the rest of the system may consider identifying) and the trailing digits.
  2. False positives in hashes/paths. A 6-char-or-longer known value that happens to appear in a hex hash, URL path, or UUID fragment will be masked, producing garbled output ([REDACTED]7890-style artefacts) and degrading the diagnostic value of the logs without actually removing the secret.

Consider matching on word boundaries (re.sub(r'\\b' + re.escape(val) + r'\\b', PLACEHOLDER, text)) or, better, masking the whole surrounding token (e.g. \S* on either side) so partial matches either don't redact or redact the entire context that contains the secret.


Reply with @kilocode-bot fix it to have Kilo Code address this issue.


return text


def redact_lines(lines: "list[str]", known_values: "list[str] | None" = None) -> "list[str]":
"""Redact a list of log lines (convenience for the paged log reader)."""

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SUGGESTION: redact_lines will crash on None or non-string elements.

The list comprehension [redact(line, known_values) for line in lines] passes each element straight to redact, which does text = _PEM_RE.sub(...). re.Pattern.sub on None raises TypeError. Since the docstring advertises this as the entry point for the paged log reader, a single malformed line (e.g. from a buggy upstream parser emitting None) will take down the whole page response rather than producing a partially-redacted result. Consider coercing or guarding: for line in lines if isinstance(line, str).


Reply with @kilocode-bot fix it to have Kilo Code address this issue.

return [redact(line, known_values) for line in lines]
Loading