New package internal/claudewatcher. The volume gate (24 turns/week of agentsquad logs vs 500/week gate) exposed that the real signal lives in daily Claude Code usage at ~/.claude/projects/*/<uuid>.jsonl, not in agentsquad output. This package captures that signal. See infra#73 Track E + hyperguild#27 for the full reframe. Components: - parser: tolerant JSONL parser over the observed Claude Code session schema (user / assistant / attachment / system + bookkeeping types). Skip-flag fast-paths queue-operation, last-prompt, permission-mode, ai-title, bridge-session, file-history-snapshot. - scrubber: 11-rule fail-closed regex set for credential shapes (bearer, postgres URIs, PEM, ssh-key, ghp_/sk-/sk-ant-/AKIA, homelab env tokens, SOPS markers). Drop turn + log on match. - cursor: postgres-backed claude_session_cursors table, keyed by (host, file_path) with byte_offset. Resumable across pod restarts. - watcher: poll loop. Walks SessionsDir, processes each .jsonl from its cursor offset, runs scrubber, emits a Batch per file to a Sink interface, advances cursor on successful Ingest. No classifier integration in this commit — every kept turn is emitted in a per-session batch. The cmd/server wiring (next commit) routes batches to brain/wiki/claude-sessions/facts/. Classifier-driven hall routing (decisions / failures / hypotheses) is a follow-up. 19 unit tests across parser + scrubber + watcher. task check green. Refs: infra#73, hyperguild#27
63 lines
2.9 KiB
Go
63 lines
2.9 KiB
Go
package claudewatcher
|
|
|
|
import "regexp"
|
|
|
|
// The scrubber flags any turn whose content matches a known-bad pattern
// so the caller can drop it. Fail-closed by design: we'd rather lose
// signal than ingest credentials into a public-readable brain. The
// caller logs the drop reason.
|
|
//
|
|
// Rules cover the credential shapes most likely to leak through Claude
// Code sessions: Authorization headers and bearer tokens, postgres URIs
// with embedded auth, PEM-encoded private keys, SSH keys, vendor API keys
// and OAuth tokens (GitHub, OpenAI, Anthropic, AWS), SOPS-encrypted
// secret blobs (we don't want the ciphertext either — it's a marker that
// the original message contained secret state), and the explicit env-var
// naming conventions used in the homelab.
|
|
//
|
|
// Pattern philosophy: match by shape, not by content. A 40-char hex
|
|
// string in isolation is fine; the same string after `Authorization:
|
|
// Bearer ` is not. Tuned to catch known leak vectors from prior
|
|
// secret-hygiene incidents (POSTGRES_PASSWORD via kubectl exec env,
|
|
// INFRA_MCP_TOKEN via sops -d output) without dropping every Edit on a
|
|
// config file.
|
|
|
|
// Rule pairs one compiled pattern with the name reported when it matches.
type Rule struct {
	// Name identifies the rule. Scrub returns it on a match, and it doubles
	// as the redact hint shown in the warn log.
	Name string
	// RE is the compiled pattern tested against the turn content.
	RE *regexp.Regexp
}
|
|
|
|
// DefaultRules is the regex set applied by Scrub. Mutable for tests but
|
|
// callers should treat it as read-only at runtime.
|
|
var DefaultRules = []Rule{
|
|
// authorization-header is checked before the bare bearer rule so
|
|
// contextual hits ("Authorization: Bearer X") report the more
|
|
// specific match name in logs.
|
|
{Name: "authorization-header", RE: regexp.MustCompile(`(?i)Authorization\s*:\s*[A-Za-z]+\s+\S{8,}`)},
|
|
{Name: "bearer-token", RE: regexp.MustCompile(`(?i)Bearer\s+[A-Za-z0-9._\-]{16,}`)},
|
|
{Name: "postgres-uri-with-password", RE: regexp.MustCompile(`postgres(?:ql)?://[^:\s/]+:[^@\s/]+@`)},
|
|
{Name: "private-key", RE: regexp.MustCompile(`-----BEGIN[^-]*PRIVATE KEY-----`)},
|
|
{Name: "ssh-key", RE: regexp.MustCompile(`ssh-(?:rsa|ed25519|ecdsa)\s+[A-Za-z0-9+/=]{40,}`)},
|
|
{Name: "github-pat", RE: regexp.MustCompile(`\b(?:ghp|gho|ghu|ghr|gha)_[A-Za-z0-9]{30,}\b`)},
|
|
{Name: "openai-sk", RE: regexp.MustCompile(`\bsk-(?:proj-)?[A-Za-z0-9]{32,}\b`)},
|
|
{Name: "anthropic-sk", RE: regexp.MustCompile(`\bsk-ant-[A-Za-z0-9_\-]{32,}\b`)},
|
|
{Name: "aws-access-key", RE: regexp.MustCompile(`\bAKIA[0-9A-Z]{16}\b`)},
|
|
{Name: "homelab-env-token", RE: regexp.MustCompile(`(?i)(?:_TOKEN|_PASSWORD|_API_KEY|_SECRET)\s*[:=]\s*['"]?[A-Za-z0-9._/+\-]{12,}`)},
|
|
{Name: "sops-encrypted-marker", RE: regexp.MustCompile(`ENC\[AES256_GCM,data:[A-Za-z0-9+/=]{8,}`)},
|
|
}
|
|
|
|
// Scrub reports the first matching rule, or empty when content is clean.
|
|
// Empty string is treated as clean. Caller decides what to do on a hit;
|
|
// the convention in claudewatcher is to drop the turn entirely and emit
|
|
// a slog.Warn naming the rule.
|
|
func Scrub(content string) string {
|
|
if content == "" {
|
|
return ""
|
|
}
|
|
for _, r := range DefaultRules {
|
|
if r.RE.MatchString(content) {
|
|
return r.Name
|
|
}
|
|
}
|
|
return ""
|
|
}
|