Files
hyperguild/ingestion/internal/claudewatcher/parser_test.go
Mathias bc011cc1f0 feat(claudewatcher): ingest Claude Code session transcripts into brain
New package internal/claudewatcher. The volume gate (24 turns/week of
agentsquad logs vs 500/week gate) exposed that the real signal lives
in daily Claude Code usage at ~/.claude/projects/*/<uuid>.jsonl, not
in agentsquad output. This package captures that signal. See infra#73
Track E + hyperguild#27 for the full reframe.

Components:
- parser: tolerant JSONL parser over the observed Claude Code session
  schema (user / assistant / attachment / system + bookkeeping types).
  A Skip flag fast-paths the bookkeeping types: queue-operation,
  last-prompt, permission-mode, ai-title, bridge-session,
  file-history-snapshot.
- scrubber: 11-rule fail-closed regex set for credential shapes
  (bearer, postgres URIs, PEM, ssh-key, ghp_/sk-/sk-ant-/AKIA, homelab
  env tokens, SOPS markers). Drop turn + log on match.
- cursor: postgres-backed claude_session_cursors table, keyed by
  (host, file_path) with byte_offset. Resumable across pod restarts.
- watcher: poll loop. Walks SessionsDir, processes each .jsonl from
  its cursor offset, runs scrubber, emits a Batch per file to a
  Sink interface, advances cursor on successful Ingest.

No classifier integration in this commit — every kept turn is emitted
in a per-session batch. The cmd/server wiring (next commit) routes
batches to brain/wiki/claude-sessions/facts/. Classifier-driven hall
routing (decisions / failures / hypotheses) is a follow-up.

19 unit tests across parser + scrubber + watcher. task check green.

Refs: infra#73, hyperguild#27
2026-05-25 19:58:58 +02:00

158 lines
5.1 KiB
Go

package claudewatcher
import (
"errors"
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// collect is a test helper that runs ParseStream over body, starting at
// offset 0 and passing nil for the third (callback) argument. Every Turn
// handed to the emit callback is gathered in order. The gathered turns are
// returned together with ParseStream's own end offset and error, unmodified.
func collect(t *testing.T, body string) ([]Turn, int64, error) {
	t.Helper()

	var gathered []Turn
	gather := func(tr Turn) error {
		gathered = append(gathered, tr)
		return nil
	}

	end, err := ParseStream(strings.NewReader(body), 0, nil, gather)
	return gathered, end, err
}
// TestParseStream_UserTurnStringContent feeds a single "user" line whose
// message is a bare JSON string and checks the decoded fields as well as the
// end offset reported for the stream.
func TestParseStream_UserTurnStringContent(t *testing.T) {
	const line = `{"type":"user","sessionId":"S","timestamp":"2026-05-25T07:00:00Z","message":"hello world"}`
	input := line + "\n"

	got, endOffset, err := collect(t, input)
	require.NoError(t, err)
	require.Len(t, got, 1)

	turn := got[0]
	assert.Equal(t, "user", turn.Type)
	assert.Equal(t, "S", turn.SessionID)
	assert.Equal(t, "hello world", turn.Content)
	assert.False(t, turn.Skip)

	// Parsing started at offset 0, so the end offset must equal the input size
	// in bytes (trailing newline included).
	assert.Equal(t, int64(len(input)), endOffset)
}
// TestParseStream_UserTurnContentBlocks checks that a user message whose
// content is a list of text blocks comes out as a single Content string with
// the block texts joined by a newline.
func TestParseStream_UserTurnContentBlocks(t *testing.T) {
	const line = `{"type":"user","sessionId":"S","timestamp":"2026-05-25T07:00:00Z","message":{"role":"user","content":[{"type":"text","text":"line 1"},{"type":"text","text":"line 2"}]}}`

	got, _, err := collect(t, line+"\n")
	require.NoError(t, err)
	require.Len(t, got, 1)

	assert.Equal(t, "line 1\nline 2", got[0].Content)
}
// TestParseStream_AssistantToolUse checks an assistant line that mixes a text
// block with a tool_use block: the tool name must land in ToolName, and
// Content must carry both the text and a "[tool_use:<name>]" marker.
func TestParseStream_AssistantToolUse(t *testing.T) {
	const line = `{"type":"assistant","sessionId":"S","timestamp":"2026-05-25T07:00:00Z","message":{"content":[{"type":"text","text":"calling now"},{"type":"tool_use","name":"Edit","input":{}}]}}`

	got, _, err := collect(t, line+"\n")
	require.NoError(t, err)
	require.Len(t, got, 1)

	turn := got[0]
	assert.Equal(t, "Edit", turn.ToolName)
	for _, fragment := range []string{"calling now", "[tool_use:Edit]"} {
		assert.Contains(t, turn.Content, fragment)
	}
}
// TestParseStream_AssistantToolResult checks that a tool_result content block
// is rendered into Content as "[tool_result] <content>".
//
// NOTE(review): the fixture line has type "user", not "assistant", so the
// test name is slightly misleading — consider renaming in a follow-up.
func TestParseStream_AssistantToolResult(t *testing.T) {
	const line = `{"type":"user","sessionId":"S","timestamp":"2026-05-25T07:00:00Z","message":{"content":[{"type":"tool_result","content":"output of cmd"}]}}`

	got, _, err := collect(t, line+"\n")
	require.NoError(t, err)
	require.Len(t, got, 1)

	assert.Contains(t, got[0].Content, "[tool_result] output of cmd")
}
// TestParseStream_SkipsBookkeepingTypes feeds one line per bookkeeping record
// type and checks that each one is still emitted as a Turn, but with Skip set.
//
// The fixture covers every type the package lists as bookkeeping, including
// bridge-session, which the earlier version of this test left out.
func TestParseStream_SkipsBookkeepingTypes(t *testing.T) {
	lines := []string{
		`{"type":"queue-operation","sessionId":"S","content":"x"}`,
		`{"type":"last-prompt","sessionId":"S","lastPrompt":"y"}`,
		`{"type":"permission-mode","sessionId":"S","permissionMode":"auto"}`,
		`{"type":"ai-title","sessionId":"S","aiTitle":"My session"}`,
		`{"type":"bridge-session","sessionId":"S"}`,
		`{"type":"file-history-snapshot","messageId":"abc"}`,
	}
	body := strings.Join(lines, "\n") + "\n"

	turns, _, err := collect(t, body)
	require.NoError(t, err)

	// Derive the expected count from the fixture so adding a type above cannot
	// silently desynchronise the assertion.
	require.Len(t, turns, len(lines))
	for _, tr := range turns {
		assert.True(t, tr.Skip, "expected Skip=true for %q", tr.Type)
	}
}
// TestParseStream_UnknownTypeIsSkip checks that a line with an unrecognised
// "type" value is neither dropped nor treated as an error: it is emitted as a
// Turn with Skip set.
func TestParseStream_UnknownTypeIsSkip(t *testing.T) {
	const line = `{"type":"future-thing","sessionId":"S"}`

	got, _, err := collect(t, line+"\n")
	require.NoError(t, err)
	require.Len(t, got, 1)

	assert.True(t, got[0].Skip)
}
// TestParseStream_MalformedLineIsSkippedNotFatal places an invalid JSON line
// between two valid user lines. It expects ParseStream to finish without an
// error, emit only the two valid turns, and call the warning callback exactly
// once.
func TestParseStream_MalformedLineIsSkippedNotFatal(t *testing.T) {
	input := strings.Join([]string{
		`{"type":"user","sessionId":"S","message":"first"}`,
		`{not valid json`,
		`{"type":"user","sessionId":"S","message":"third"}`,
	}, "\n") + "\n"

	var (
		warnCalls int
		emitted   []Turn
	)
	// The warning callback only counts invocations; the message is not inspected.
	onWarn := func(_ string, _ ...any) {
		warnCalls++
	}
	onTurn := func(tr Turn) error {
		emitted = append(emitted, tr)
		return nil
	}

	_, err := ParseStream(strings.NewReader(input), 0, onWarn, onTurn)
	require.NoError(t, err)
	require.Len(t, emitted, 2, "first + third should make it through")
	assert.Equal(t, 1, warnCalls)
}
func TestParseStream_EmitErrStopHaltsCleanly(t *testing.T) {
body := strings.Join([]string{
`{"type":"user","sessionId":"S","message":"a"}`,
`{"type":"user","sessionId":"S","message":"b"}`,
`{"type":"user","sessionId":"S","message":"c"}`,
}, "\n") + "\n"
count := 0
end, err := ParseStream(strings.NewReader(body), 0, nil, func(tr Turn) error {
count++
if count == 2 {
return ErrStop
}
return nil
})
require.NoError(t, err)
assert.Equal(t, 2, count)
assert.Greater(t, end, int64(0))
}
func TestParseStream_EmitOtherErrorPropagates(t *testing.T) {
body := `{"type":"user","sessionId":"S","message":"a"}` + "\n"
want := errors.New("boom")
_, err := ParseStream(strings.NewReader(body), 0, nil, func(tr Turn) error {
return want
})
require.Error(t, err)
assert.Contains(t, err.Error(), "boom")
}
// TestParseStream_AttachmentHookEvent checks that an "attachment" line
// carrying a hook_success payload yields a Turn whose Content is the
// attachment's own "content" field.
func TestParseStream_AttachmentHookEvent(t *testing.T) {
	const line = `{"type":"attachment","sessionId":"S","timestamp":"2026-05-25T07:00:00Z","attachment":{"type":"hook_success","hookName":"SessionStart:startup","hookEvent":"SessionStart","content":"hook body"}}`

	got, _, err := collect(t, line+"\n")
	require.NoError(t, err)
	require.Len(t, got, 1)

	assert.Equal(t, "hook body", got[0].Content)
}
func TestParseStream_OffsetAdvances(t *testing.T) {
body := `{"type":"user","sessionId":"S","message":"a"}` + "\n" +
`{"type":"user","sessionId":"S","message":"b"}` + "\n"
var offsets []int64
_, err := ParseStream(strings.NewReader(body), 100, nil, func(tr Turn) error {
offsets = append(offsets, tr.OffsetAfter)
return nil
})
require.NoError(t, err)
require.Len(t, offsets, 2)
assert.Greater(t, offsets[0], int64(100))
assert.Greater(t, offsets[1], offsets[0])
}