Pre-rollout guard. Source code stays clean — client identities come from CLAUDE_INGEST_CLIENT_BLOCK env (sourced from a SOPS-encrypted k8s secret in infra repo). Env value is a regex alternation; main wraps it with `(?i)\b(...)\b` so word-boundary matching avoids false hits inside longer identifiers (e.g. "Sebastian" doesn't trigger on "SEB"). DefaultRules (credential shapes) still take precedence so any leak that's BOTH a client mention AND a credential shape logs as the credential — strictly more dangerous, points triage at the right thing. Tests cover precedence + case variations + word-boundary respect + invalid-pattern rejection. Refs: infra#73 Track E.1 pre-rollout grill (option B). Bump-Type: minor
115 lines
4.5 KiB
Go
115 lines
4.5 KiB
Go
package claudewatcher
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"sync"
|
|
)
|
|
|
|
// Scrubber drops any turn whose content matches a known-bad pattern.
|
|
// Fail-closed by design: we'd rather lose signal than ingest credentials
|
|
// into a public-readable brain. The caller logs the drop reason.
|
|
//
|
|
// Rules cover the credential shapes most common to leak through Claude
|
|
// Code sessions: bearer tokens, postgres URIs with embedded auth, OAuth
|
|
// secret values, SOPS-encrypted secret blobs (we don't want the
|
|
// ciphertext either — it's a marker that the original message contained
|
|
// secret state), PEM-encoded private keys, and the explicit env-var
|
|
// naming conventions used in the homelab.
|
|
//
|
|
// Pattern philosophy: match by shape, not by content. A 40-char hex
|
|
// string in isolation is fine; the same string after `Authorization:
|
|
// Bearer ` is not. Tuned to catch known leak vectors from prior
|
|
// secret-hygiene incidents (POSTGRES_PASSWORD via kubectl exec env,
|
|
// INFRA_MCP_TOKEN via sops -d output) without dropping every Edit on a
|
|
// config file.
|
|
|
|
// Rule pairs a compiled pattern with the name that identifies it when
// the pattern matches.
type Rule struct {
	// Name is the identifier Scrub returns on a hit; the caller's warn
	// log names the rule with it.
	Name string
	// RE is the compiled pattern tested against the turn content.
	RE *regexp.Regexp
}
|
|
|
|
// DefaultRules is the regex set applied by Scrub. Mutable for tests but
|
|
// callers should treat it as read-only at runtime.
|
|
var DefaultRules = []Rule{
|
|
// authorization-header is checked before the bare bearer rule so
|
|
// contextual hits ("Authorization: Bearer X") report the more
|
|
// specific match name in logs.
|
|
{Name: "authorization-header", RE: regexp.MustCompile(`(?i)Authorization\s*:\s*[A-Za-z]+\s+\S{8,}`)},
|
|
{Name: "bearer-token", RE: regexp.MustCompile(`(?i)Bearer\s+[A-Za-z0-9._\-]{16,}`)},
|
|
{Name: "postgres-uri-with-password", RE: regexp.MustCompile(`postgres(?:ql)?://[^:\s/]+:[^@\s/]+@`)},
|
|
{Name: "private-key", RE: regexp.MustCompile(`-----BEGIN[^-]*PRIVATE KEY-----`)},
|
|
{Name: "ssh-key", RE: regexp.MustCompile(`ssh-(?:rsa|ed25519|ecdsa)\s+[A-Za-z0-9+/=]{40,}`)},
|
|
{Name: "github-pat", RE: regexp.MustCompile(`\b(?:ghp|gho|ghu|ghr|gha)_[A-Za-z0-9]{30,}\b`)},
|
|
{Name: "openai-sk", RE: regexp.MustCompile(`\bsk-(?:proj-)?[A-Za-z0-9]{32,}\b`)},
|
|
{Name: "anthropic-sk", RE: regexp.MustCompile(`\bsk-ant-[A-Za-z0-9_\-]{32,}\b`)},
|
|
{Name: "aws-access-key", RE: regexp.MustCompile(`\bAKIA[0-9A-Z]{16}\b`)},
|
|
{Name: "homelab-env-token", RE: regexp.MustCompile(`(?i)(?:_TOKEN|_PASSWORD|_API_KEY|_SECRET)\s*[:=]\s*['"]?[A-Za-z0-9._/+\-]{12,}`)},
|
|
{Name: "sops-encrypted-marker", RE: regexp.MustCompile(`ENC\[AES256_GCM,data:[A-Za-z0-9+/=]{8,}`)},
|
|
}
|
|
|
|
// extraRules holds the rules added at process startup via RegisterRule;
// Scrub consults them after DefaultRules. The mutex guards concurrent
// RegisterRule calls (rare) against concurrent Scrub reads (hot path).
// Scrub returns before touching the lock when a DefaultRules entry
// matches; otherwise it takes the read lock unconditionally — even when
// extraRules is empty — because checking for emptiness without the lock
// would itself race with RegisterRule.
|
|
var (
|
|
extraRulesMu sync.RWMutex
|
|
extraRules []Rule
|
|
)
|
|
|
|
// RegisterRule appends a runtime-configured regex to the scrubber's
|
|
// rule set. Used by main to inject client-name guards from
|
|
// CLAUDE_INGEST_CLIENT_BLOCK env var (or equivalent SOPS-encrypted
|
|
// secret) without baking client identities into source code.
|
|
//
|
|
// pattern is compiled as-is — callers wrap with `\b...\b` and case
|
|
// flags as needed. Duplicate names are accepted (rules are positional);
|
|
// the second registration just fires after the first.
|
|
func RegisterRule(name, pattern string) error {
|
|
re, err := regexp.Compile(pattern)
|
|
if err != nil {
|
|
return fmt.Errorf("compile rule %q: %w", name, err)
|
|
}
|
|
extraRulesMu.Lock()
|
|
extraRules = append(extraRules, Rule{Name: name, RE: re})
|
|
extraRulesMu.Unlock()
|
|
return nil
|
|
}
|
|
|
|
// ResetExtraRules clears every RegisterRule-added rule. Test-only.
|
|
func ResetExtraRules() {
|
|
extraRulesMu.Lock()
|
|
extraRules = nil
|
|
extraRulesMu.Unlock()
|
|
}
|
|
|
|
// Scrub reports the first matching rule, or empty when content is clean.
|
|
// Empty string is treated as clean. Caller decides what to do on a hit;
|
|
// the convention in claudewatcher is to drop the turn entirely and emit
|
|
// a slog.Warn naming the rule.
|
|
//
|
|
// Rule order: DefaultRules first (credential shapes), then runtime
|
|
// RegisterRule additions (client-name guards). Credential leaks
|
|
// outrank client-name hits in the log because they're strictly more
|
|
// dangerous.
|
|
func Scrub(content string) string {
|
|
if content == "" {
|
|
return ""
|
|
}
|
|
for _, r := range DefaultRules {
|
|
if r.RE.MatchString(content) {
|
|
return r.Name
|
|
}
|
|
}
|
|
extraRulesMu.RLock()
|
|
defer extraRulesMu.RUnlock()
|
|
for _, r := range extraRules {
|
|
if r.RE.MatchString(content) {
|
|
return r.Name
|
|
}
|
|
}
|
|
return ""
|
|
}
|