feat: implement indirect prompt injection protection and expanded secret redaction

This commit is contained in:
Steven Choo
2026-04-27 12:42:20 +02:00
parent 909fec9abb
commit 15c0c8a45d
4 changed files with 31 additions and 0 deletions
+6
View File
@@ -263,6 +263,12 @@ function shouldAcceptWorkspaceMemoryCandidate(
if (/^(function|class|interface|type|const|let|var)\s+\w+/.test(text)) return false;
if (/^(GET|POST|PUT|DELETE|PATCH)\s+\//.test(text)) return false;
// Indirect Prompt Injection / Adversarial Instructions
// Rejects attempts to overwrite system behavior or "ignore" rules.
// comparative "instead of" is allowed.
if (/\b(ignore\s+all|ignore\s+previous|ignore\s+instruction|overwrite\s+system|overwrite\s+rules|forget\s+all|delete\s+root)\b/i.test(text)) return false;
if (/\b(ignore|instruction|overwrite)\b/i.test(text) && /\b(previous|all|rules|behavior|prompt|system)\b/i.test(text)) return false;
// Path-heavy facts (rediscoverable from repo)
const pathCount = (text.match(/\/[\w.-]+(\/[\w.-]+)+/g) || []).length;
if (pathCount > 2) return false;
+8
View File
@@ -11,10 +11,12 @@ const SECRET_VALUE = String.raw`[^` + "`" + String.raw`'",,\s\[]+`;
const PASSWORD_LABELS = /password|passwd|pwd|密碼|密码|パスワード|비밀번호|contraseña|mot de passe|passwort/i;
const USERNAME_LABELS = /username|user name|用戶名|用户名|ユーザー名|사용자명|usuario|utilisateur|benutzer/i;
const SENSITIVE_LABELS = /api[_-]?key|token|bearer|secret|credential|auth|auth[_-]?key|private[_-]?key/i;
const PIN_PREFIX = String.raw`(\bPIN\b(?:\s*(?:是|=|:|)\s*|\s+(?![是=:])))`;
const PASSWORD_PREFIX = String.raw`((?:${PASSWORD_LABELS.source})(?:\s*(?:是|=|:|)\s*|\s+(?![是=:])))`;
const USERNAME_PREFIX = String.raw`((?:${USERNAME_LABELS.source})(?:\s*(?:是|=|:|)\s*|\s+(?![是=:])))`;
const SENSITIVE_PREFIX = String.raw`((?:${SENSITIVE_LABELS.source})(?:\s*(?:推|是|=|:|)\s*|[:]\s*))`;
export type MemoryConsolidationReason =
| "promoted"
@@ -231,6 +233,12 @@ export function redactCredentials(text: string): string {
"$1[REDACTED]",
);
// 4. Standalone sensitive keys/tokens
result = result.replace(
new RegExp(String.raw`${SENSITIVE_PREFIX}[\`'"]?(${SECRET_VALUE})`, "gi"),
"$1[REDACTED]",
);
return result;
}