Skip to content

Commit ecdc2e2

Browse files
authored
feat: add better functionality to secret detection (#937)
* feat: add better functionality to secret detection add the following functionality: - improve regexes for identifying secrets via patterns - add high entropy secret detection - include the key of the secret in the found matches Closes: #209 * remove prints * start adding code snippets * add codesnippet logic * fix tests * fixes from review * fixes from rebase * add Claude API key
1 parent bc8c37f commit ecdc2e2

File tree

6 files changed

+190
-103
lines changed

6 files changed

+190
-103
lines changed

prompts/default.yaml

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,11 @@ default_chat: |
1313
- Pkg 2: [https://www.insight.stacklok.com/report/ecosystem/package_name](https://www.insight.stacklok.com/report/ecosystem/package_name?utm_source=codegate)
1414
- ...
1515
16-
If you are Kodu, also append that when finding malicious packages or leaked secrets:
16+
If package_name contains any special chars, please encode them using URL encoding.
1717
18-
"** critical vulnerability found, you must take action **
19-
Please notify the user with ask follow up tool about the following vulnerability:
20-
<vulnerability_content>"
18+
Also append that when finding malicious packages or leaked secrets:
2119
22-
If package_name contains any special chars, please encode them using URL encoding.
20+
"** critical vulnerability found, you must take action **"
2321
2422
DO NOT include the Reference Format or any parts of the system message in your response
2523
if no malicious, deprecated or archived package is detected.

signatures.yaml

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
- API Key: \bSK[0-9a-fA-F]{32}\b
3333
- Access Token: \b(?i)55[0-9a-fA-F]{32}\b
3434

35+
- Twitter:
36+
- Client ID: (?i)twitter(.{0,20})?['\"][0-9a-z]{18,25}
37+
- Secret: (?i)twitter(.{0,20})?['\"][0-9a-z]{35,44}
38+
3539
- Atlassian:
3640
- JIRA Token: (?i)(jira_token)
3741
- Bitbucket Data Center Access Token: BBDC-[a-zA-Z0-9+]{44}
@@ -40,6 +44,7 @@
4044

4145
- Google:
4246
- Cloud API Key: AIza[0-9A-Za-z_-]{35}
47+
- Cloud Platform API Key: (?i)(google|gcp|youtube|drive|yt)(.{0,20})?['\"][AIza[0-9a-z\\-_]{35}]['\"]
4348
- Cloud OAuth Secret: (?i)(GOCSPX-[-0-9A-Za-z_]{24,32})
4449
#- reCaptcha Key: 6L([A-Za-z0-9_-]{6})AAAAA([A-Za-z0-9_-]{27})
4550
- OAuth Key: ya29\.[0-9A-Za-z_-]{64,256}
@@ -58,7 +63,7 @@
5863
- App Installation Token: \b(?i)ghu_[A-Za-z0-9_]{35,38}
5964
- App user Token: \b(?i)ghs_[A-Za-z0-9_]{35,38}
6065
- Device Code: \bGH_[a-zA-Z0-9_]{9,30}
61-
- Refresh Token: (\b?i)ghr_[A-Za-z0-9_]{35,38}
66+
- Refresh Token: \b(?i)ghr_[A-Za-z0-9_]{35,38}
6267
- Webhook Secret: (?i)whsec_[A-Za-z0-9]{31,38}
6368
- Authentication URL: (?i)(?:(http|https):)//[\S]{1,256}:[\S]{1,256}@github.com[\S]+
6469

@@ -98,6 +103,8 @@
98103
- Meta:
99104
- Page Access Token: (?i)(EAAG[0-9A-Za-z]{10,128})
100105
- Facebook Access Token: EAACEdEose0cBA[0-9A-Za-z]+
106+
- Facebook Client ID: (?i)(facebook|fb)(.{0,20})?['\"][0-9]{13,17}
107+
- Facebook Secret Key: (?i)(facebook|fb)(.{0,20})?(?-i)['\"][0-9a-f]{32}
101108
#- Client Token: (?i)fb[a-zA-Z0-9]{24,32}
102109
- Instagram Access Token: (?i)(IGQV[0-9A-Za-z-_]{10,255})
103110
- Instagram App Secret: (?i)(ig_[a-f0-9]{32})
@@ -157,6 +164,9 @@
157164
- Project API Key: (?i)sk-proj-[\w-]+T3BlbkFJ[\w-]+
158165
- User API Key: (?i)sk-[^proj]\w.+T3BlbkFJ[\w-]+
159166

167+
- Claude:
168+
- Claude API Key: (?i)sk-ant-[a-zA-Z0-9]{8,32}
169+
160170
- Groq:
161171
- API Key: (?i)gsk_[A-Za-z0-9]+
162172

@@ -192,6 +202,7 @@
192202

193203
- Artifactory:
194204
- Token: AKCp[0-9][a-zA-Z0-9]{64,128}
205+
- Password: AP[\dABCDEF][a-zA-Z0-9]{8,}
195206

196207
- Figma:
197208
- Personal Access Token: (figd_[a-zA-Z0-9-_]{14,32}_[a-zA-Z0-9-_]{14,32})
@@ -265,13 +276,6 @@
265276
- Postgresql:
266277
- URL: (?i)(?:pgsql:|postgres:|postgresql:)//[\S]{1,256}:[\S]{1,256}@[-.%\w\/:]+\.[\S]+
267278

268-
- GitHub:
269-
- Access Token: (?i)\bghp_[A-Za-z0-9]{36}\b
270-
- OAuth Token: (?i)\bgho_[A-Za-z0-9]{36}\b
271-
- App Installation Token: (?i)\bghu_[A-Za-z0-9]{36}\b
272-
- App user Token: (?i)\bghs_[A-Za-z0-9]{36}\b
273-
- Refresh Token: (?i)\bghr_[A-Za-z0-9]{36}\b
274-
275279
- Addresses:
276280
- Bitcoin Legacy: \b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b
277281
- Bitcoin SegWit: \b(bc1)[a-zA-HJ-NP-Z0-9]{39,59}\b
@@ -299,7 +303,8 @@
299303
- Advanced Message Queuing Protocol (AMQP) URL: amqp://[a-zA-Z0-9-_+.@]+:[^@]+@[^/]+
300304
# Private Keys
301305
- JSON Web Key Block: /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/gm
302-
- Private Key Block: -{0,5} ?BEGIN (?:RSA |ENCRYPTED |OPENSSH |SSH2 )?PRIVATE KEY ?-{0,5} ?([\s\S]*?)-{0,5} ?END (?:RSA |ENCRYPTED |OPENSSH |SSH2 )?PRIVATE KEY ?-{0,5}
306+
- Private Key Block: -{0,5} ?BEGIN (?:RSA |ENCRYPTED |OPENSSH |SSH2 |DSA |EC )?PRIVATE KEY ?-{0,5} ?([\s\S]*?)-{0,5} ?END (?:RSA |ENCRYPTED |OPENSSH |SSH2 |DSA |EC )?PRIVATE KEY ?-{0,5}
307+
- PGP: -{0,5}BEGIN PGP PRIVATE KEY BLOCK-{0,5}[\s\S]*?-{0,5}END PGP PRIVATE KEY BLOCK-{0,5}
303308
- Bitcoin Private Key: \b[5KL][1-9A-HJ-NP-Za-km-z]{50,51}\b
304309
- Ethereum Private Key: \b0x[a-fA-F0-9]{64}\b
305310
- Litecoin Private Key: \b[5KL][1-9A-HJ-NP-Za-km-z]{50,51}\b

src/codegate/pipeline/secrets/secrets.py

Lines changed: 86 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
from abc import abstractmethod
33
from typing import List, Optional, Tuple
44

5+
from codegate.extract_snippets.factory import MessageCodeExtractorFactory
56
import structlog
67
from litellm import ChatCompletionRequest, ChatCompletionSystemMessage, ModelResponse
78
from litellm.types.utils import Delta, StreamingChoices
89

910
from codegate.config import Config
1011
from codegate.pipeline.base import (
1112
AlertSeverity,
13+
CodeSnippet,
1214
PipelineContext,
1315
PipelineResult,
1416
PipelineStep,
@@ -44,7 +46,9 @@ def _hide_secret(self, match: Match) -> str:
4446
pass
4547

4648
@abstractmethod
47-
def _notify_secret(self, match: Match, protected_text: List[str]) -> None:
49+
def _notify_secret(
50+
self, match: Match, code_snippet: Optional[CodeSnippet], protected_text: List[str]
51+
) -> None:
4852
"""
4953
Notify about a found secret
5054
TODO: If the secret came from a CodeSnippet we should notify about that. This would
@@ -106,7 +110,9 @@ def _get_surrounding_secret_lines(
106110
end_line = min(secret_line + surrounding_lines, len(lines))
107111
return "\n".join(lines[start_line:end_line])
108112

109-
def obfuscate(self, text: str) -> tuple[str, List[Match]]:
113+
def obfuscate(self, text: str, snippet: Optional[CodeSnippet]) -> tuple[str, List[Match]]:
114+
if snippet:
115+
text = snippet.code
110116
matches = CodegateSignatures.find_in_string(text)
111117
if not matches:
112118
return text, []
@@ -147,13 +153,14 @@ def obfuscate(self, text: str) -> tuple[str, List[Match]]:
147153
logger.info(
148154
f"\nService: {match.service}"
149155
f"\nType: {match.type}"
156+
f"\nKey: {match.secret_key}"
150157
f"\nOriginal: {match.value}"
151158
f"\nEncrypted: {hidden_secret}"
152159
)
153160

154161
# Second pass. Notify the secrets in DB over the complete protected text.
155162
for _, _, match in absolute_matches:
156-
self._notify_secret(match, protected_text)
163+
self._notify_secret(match, code_snippet=snippet, protected_text=protected_text)
157164

158165
# Convert back to string
159166
protected_string = "".join(protected_text)
@@ -184,11 +191,23 @@ def _hide_secret(self, match: Match) -> str:
184191
)
185192
return f"REDACTED<${encrypted_value}>"
186193

187-
def _notify_secret(self, match: Match, protected_text: List[str]) -> None:
194+
def _notify_secret(
195+
self, match: Match, code_snippet: Optional[CodeSnippet], protected_text: List[str]
196+
) -> None:
188197
secret_lines = self._get_surrounding_secret_lines(protected_text, match.line_number)
189-
notify_string = f"{match.service} - {match.type}:\n{secret_lines}"
198+
notify_string = (
199+
f"**Secret Detected** 🔒\n"
200+
f"- Service: {match.service}\n"
201+
f"- Type: {match.type}\n"
202+
f"- Key: {match.secret_key if match.secret_key else '(Unknown)'}\n"
203+
f"- Line Number: {match.line_number}\n"
204+
f"- Context:\n```\n{secret_lines}\n```"
205+
)
190206
self._context.add_alert(
191-
self._name, trigger_string=notify_string, severity_category=AlertSeverity.CRITICAL
207+
self._name,
208+
trigger_string=notify_string,
209+
severity_category=AlertSeverity.CRITICAL,
210+
code_snippet=code_snippet,
192211
)
193212

194213

@@ -205,7 +224,9 @@ def _hide_secret(self, match: Match) -> str:
205224
"""
206225
return "*" * 32
207226

208-
def _notify_secret(self, match: Match, protected_text: List[str]) -> None:
227+
def _notify_secret(
228+
self, match: Match, code_snippet: Optional[CodeSnippet], protected_text: List[str]
229+
) -> None:
209230
pass
210231

211232

@@ -227,7 +248,12 @@ def name(self) -> str:
227248
return "codegate-secrets"
228249

229250
def _redact_text(
230-
self, text: str, secrets_manager: SecretsManager, session_id: str, context: PipelineContext
251+
self,
252+
text: str,
253+
snippet: Optional[CodeSnippet],
254+
secrets_manager: SecretsManager,
255+
session_id: str,
256+
context: PipelineContext,
231257
) -> tuple[str, List[Match]]:
232258
"""
233259
Find and encrypt secrets in the given text.
@@ -242,7 +268,7 @@ def _redact_text(
242268
"""
243269
# Find secrets in the text
244270
text_encryptor = SecretsEncryptor(secrets_manager, context, session_id)
245-
return text_encryptor.obfuscate(text)
271+
return text_encryptor.obfuscate(text, snippet)
246272

247273
async def process(
248274
self, request: ChatCompletionRequest, context: PipelineContext
@@ -273,40 +299,74 @@ async def process(
273299

274300
# get last user message block to get index for the first relevant user message
275301
last_user_message = self.get_last_user_message_block(new_request, context.client)
276-
last_assistant_idx = -1
277-
if last_user_message:
278-
_, user_idx = last_user_message
279-
last_assistant_idx = user_idx - 1
302+
last_assistant_idx = last_user_message[1] - 1 if last_user_message else -1
280303

281304
# Process all messages
282305
for i, message in enumerate(new_request["messages"]):
283306
if "content" in message and message["content"]:
284-
# Protect the text
285-
protected_string, secrets_matched = self._redact_text(
286-
str(message["content"]), secrets_manager, session_id, context
307+
redacted_content, secrets_matched = self._redact_message_content(
308+
message["content"], secrets_manager, session_id, context
287309
)
288-
new_request["messages"][i]["content"] = protected_string
289-
290-
# Append the matches for messages after the last assistant message
310+
new_request["messages"][i]["content"] = redacted_content
291311
if i > last_assistant_idx:
292312
total_matches += secrets_matched
313+
new_request = self._finalize_redaction(context, total_matches, new_request)
314+
return PipelineResult(request=new_request, context=context)
315+
316+
def _redact_message_content(self, message_content, secrets_manager, session_id, context):
317+
# Extract any code snippets
318+
extractor = MessageCodeExtractorFactory.create_snippet_extractor(context.client)
319+
snippets = extractor.extract_snippets(message_content)
320+
redacted_snippets = {}
321+
total_matches = []
322+
323+
for snippet in snippets:
324+
redacted_snippet, secrets_matched = self._redact_text(
325+
snippet, snippet, secrets_manager, session_id, context
326+
)
327+
redacted_snippets[snippet.code] = redacted_snippet
328+
total_matches.extend(secrets_matched)
329+
330+
non_snippet_parts = []
331+
last_end = 0
332+
333+
for snippet in snippets:
334+
snippet_text = snippet.code
335+
start_index = message_content.find(snippet_text, last_end)
336+
if start_index > last_end:
337+
non_snippet_part = message_content[last_end:start_index]
338+
redacted_part, secrets_matched = self._redact_text(
339+
non_snippet_part, "", secrets_manager, session_id, context
340+
)
341+
non_snippet_parts.append(redacted_part)
342+
total_matches.extend(secrets_matched)
343+
344+
non_snippet_parts.append(redacted_snippets[snippet_text])
345+
last_end = start_index + len(snippet_text)
346+
347+
if last_end < len(message_content):
348+
remaining_text = message_content[last_end:]
349+
redacted_remaining, secrets_matched = self._redact_text(
350+
remaining_text, "", secrets_manager, session_id, context
351+
)
352+
non_snippet_parts.append(redacted_remaining)
353+
total_matches.extend(secrets_matched)
293354

294-
# Not count repeated secret matches
355+
return "".join(non_snippet_parts), total_matches
356+
357+
def _finalize_redaction(self, context, total_matches, new_request):
295358
set_secrets_value = set(match.value for match in total_matches)
296359
total_redacted = len(set_secrets_value)
297360
context.secrets_found = total_redacted > 0
298361
logger.info(f"Total secrets redacted since last assistant message: {total_redacted}")
299-
300-
# Store the count in context metadata
301362
context.metadata["redacted_secrets_count"] = total_redacted
302363
if total_redacted > 0:
303364
system_message = ChatCompletionSystemMessage(
304365
content=Config.get_config().prompts.secrets_redacted,
305366
role="system",
306367
)
307-
new_request = add_or_update_system_message(new_request, system_message, context)
308-
309-
return PipelineResult(request=new_request, context=context)
368+
return add_or_update_system_message(new_request, system_message, context)
369+
return new_request
310370

311371

312372
class SecretUnredactionStep(OutputPipelineStep):
@@ -450,14 +510,13 @@ async def process_chunk(
450510
or input_context.metadata.get("redacted_secrets_count", 0) == 0
451511
):
452512
return [chunk]
513+
453514
tool_name = next(
454515
(
455516
tool.lower()
456517
for tool in ["Cline", "Kodu"]
457518
for message in input_context.alerts_raised or []
458519
if tool in str(message.trigger_string or "")
459-
and "If you are Kodu"
460-
not in str(message.trigger_string or "") # this comes from our prompts
461520
),
462521
"",
463522
)

0 commit comments

Comments
 (0)