package vectorstore_test

import (
	"context"
	"errors"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// stubStore is an in-memory test double for the store passed to
// vectorstore.Sync. It remembers every Upsert and Delete it receives so
// tests can inspect them afterwards.
type stubStore struct {
	// known maps chunk-path → updated_at. A zero time is replaced by
	// farFuture when the map is handed out, so tests that don't care
	// about re-embed-on-mtime can leave the value empty. Tests that do
	// exercise the mtime path set updated_at explicitly.
	known map[string]time.Time
	// upserts holds the vector most recently upserted for each path.
	upserts map[string][]float32
	// deletes lists the paths passed to Delete, in call order.
	deletes []string
	// failNext, when non-nil, is returned by the next Upsert call and
	// then cleared.
	failNext error
}

// farFuture is "newer than any file mtime"; it stands in for a zero
// updated_at in stubs that don't care about re-embed behavior.
var farFuture = time.Now().Add(24 * time.Hour)

// KnownPathsWithTime returns a copy of s.known in which every zero
// timestamp has been replaced by farFuture. It never fails.
func (s *stubStore) KnownPathsWithTime(_ context.Context) (map[string]time.Time, error) {
	snapshot := make(map[string]time.Time, len(s.known))
	for chunkPath, updatedAt := range s.known {
		if updatedAt.IsZero() {
			snapshot[chunkPath] = farFuture
			continue
		}
		snapshot[chunkPath] = updatedAt
	}
	return snapshot, nil
}

// Upsert records v under path. If failNext is set, that error is
// returned instead (once) and nothing is recorded.
func (s *stubStore) Upsert(_ context.Context, path string, v []float32) error {
	if pending := s.failNext; pending != nil {
		s.failNext = nil
		return pending
	}
	// Allocate lazily so a zero-value stubStore is usable.
	if s.upserts == nil {
		s.upserts = make(map[string][]float32)
	}
	s.upserts[path] = v
	return nil
}

// Delete appends path to s.deletes and always succeeds.
func (s *stubStore) Delete(_ context.Context, path string) error {
	s.deletes = append(s.deletes, path)
	return nil
}

// stubEmbedder is a canned embedder: it ignores its input and hands
// back the configured vector and error.
type stubEmbedder struct {
	vec []float32
	err error
}

// Embed returns e.vec and e.err regardless of the text supplied.
func (e stubEmbedder) Embed(_ context.Context, _ string) ([]float32, error) {
	return e.vec, e.err
}

// writeNote creates dir/rel (including any missing parent directories)
// with the given body, failing the test immediately on any I/O error.
func writeNote(t *testing.T, dir, rel, body string) {
	t.Helper()
	target := filepath.Join(dir, rel)
	require.NoError(t, os.MkdirAll(filepath.Dir(target), 0o755))
	require.NoError(t, os.WriteFile(target, []byte(body), 0o644))
}

// TestSync_AddsNewFiles syncs two notes against an empty store and
// expects both to be counted as added and upserted under their "#0001"
// chunk paths, with nothing deleted.
func TestSync_AddsNewFiles(t *testing.T) {
	root := t.TempDir()
	writeNote(t, root, "wiki/jepa-fx/facts/x.md", "body of x")
	writeNote(t, root, "wiki/jepa-fx/facts/y.md", "body of y")

	st := &stubStore{known: map[string]time.Time{}}
	embedder := stubEmbedder{vec: make([]float32, 768)}
	ctx := context.Background()

	got, err := vectorstore.Sync(ctx, root, st, embedder)
	require.NoError(t, err)

	assert.Equal(t, 2, got.Added)
	assert.Empty(t, got.Deleted)
	assert.Contains(t, st.upserts, "wiki/jepa-fx/facts/x.md#0001")
	assert.Contains(t, st.upserts, "wiki/jepa-fx/facts/y.md#0001")
}

// TestSync_SkipsAlreadyKnown syncs a note whose chunk path is already
// present in the store and expects no additions and no upserts.
func TestSync_SkipsAlreadyKnown(t *testing.T) {
	root := t.TempDir()
	writeNote(t, root, "wiki/a/facts/x.md", "x")

	// Zero timestamp: the stub reports it as farFuture.
	alreadyIndexed := map[string]time.Time{
		"wiki/a/facts/x.md#0001": time.Time{},
	}
	st := &stubStore{known: alreadyIndexed}
	embedder := stubEmbedder{vec: make([]float32, 768)}
	ctx := context.Background()

	got, err := vectorstore.Sync(ctx, root, st, embedder)
	require.NoError(t, err)

	assert.Equal(t, 0, got.Added)
	assert.Empty(t, st.upserts)
}

// TestSync_DeletesDisappearedFiles seeds the store with a chunk path
// that has no file on disk and expects exactly one deletion to be
// reported.
func TestSync_DeletesDisappearedFiles(t *testing.T) {
	root := t.TempDir()
	require.NoError(t, os.MkdirAll(filepath.Join(root, "wiki"), 0o755))

	// The store remembers a path whose file is no longer on disk.
	inner := &stubStore{known: map[string]time.Time{
		"wiki/old/facts/ghost.md#0001": time.Time{},
	}}
	wrapped := &stubStoreWithDelete{stubStore: inner}
	embedder := stubEmbedder{vec: make([]float32, 768)}
	ctx := context.Background()

	got, err := vectorstore.Sync(ctx, root, wrapped, embedder)
	require.NoError(t, err)

	assert.Equal(t, 1, got.Deleted)
}

// stubStoreWithDelete wraps a *stubStore by embedding it. It declares no
// methods of its own, so calls made through the wrapper (Delete
// included) are recorded on the embedded stubStore.
type stubStoreWithDelete struct { *stubStore } func TestSync_SkipsIndexFiles(t *testing.T) { dir := t.TempDir() writeNote(t, dir, "wiki/a/_index.md", "moc") writeNote(t, dir, "wiki/a/facts/real.md", "body") store := &stubStore{known: map[string]time.Time{}} res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)}) require.NoError(t, err) assert.Equal(t, 1, res.Added) assert.NotContains(t, store.upserts, "wiki/a/_index.md#0001") } func TestSync_ScansKnowledgeDir(t *testing.T) { dir := t.TempDir() writeNote(t, dir, "wiki/a/facts/x.md", "x") writeNote(t, dir, "knowledge/2026-05-19-koala-gpu-setup.md", "knowledge body") store := &stubStore{known: map[string]time.Time{}} emb := stubEmbedder{vec: make([]float32, 768)} res, err := vectorstore.Sync(context.Background(), dir, store, emb) require.NoError(t, err) assert.Equal(t, 2, res.Added) assert.Contains(t, store.upserts, "wiki/a/facts/x.md#0001") assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md#0001") } func TestSync_ChunksLongFiles(t *testing.T) { dir := t.TempDir() // Build a file that's well over the chunk byte budget. Multi-section // markdown so the chunker has heading boundaries to cut on. body := "# Doc\n\nintro line.\n\n" for i := 0; i < 10; i++ { body += "## Section " + string(rune('A'+i)) + "\n\n" body += strings.Repeat("This section has a fair amount of content. ", 50) + "\n\n" } writeNote(t, dir, "knowledge/long.md", body) store := &stubStore{known: map[string]time.Time{}} emb := stubEmbedder{vec: make([]float32, 768)} res, err := vectorstore.Sync(context.Background(), dir, store, emb) require.NoError(t, err) assert.Greater(t, res.Added, 1, "long file should produce multiple chunk rows") // Every upserted path for this file must be a chunk path. 
chunkCount := 0 for p := range store.upserts { if strings.HasPrefix(p, "knowledge/long.md#") { chunkCount++ } } assert.Equal(t, res.Added, chunkCount, "all rows for long file should be chunk-suffixed") // The bare parent path must NOT be upserted directly. assert.NotContains(t, store.upserts, "knowledge/long.md") } func TestSync_ShortFileGetsSingleChunkRow(t *testing.T) { dir := t.TempDir() writeNote(t, dir, "wiki/short.md", "tiny body\n") store := &stubStore{known: map[string]time.Time{}} emb := stubEmbedder{vec: make([]float32, 768)} res, err := vectorstore.Sync(context.Background(), dir, store, emb) require.NoError(t, err) assert.Equal(t, 1, res.Added) assert.Contains(t, store.upserts, "wiki/short.md#0001") } func TestSync_SkipsFileIfAnyChunkAlreadyKnown(t *testing.T) { dir := t.TempDir() writeNote(t, dir, "wiki/foo.md", "body\n") store := &stubStore{known: map[string]time.Time{ "wiki/foo.md#0001": {}, }} emb := stubEmbedder{vec: make([]float32, 768)} res, err := vectorstore.Sync(context.Background(), dir, store, emb) require.NoError(t, err) assert.Equal(t, 0, res.Added) assert.Empty(t, store.upserts) } func TestSync_DeletesAllChunksOfDisappearedFile(t *testing.T) { dir := t.TempDir() require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755)) store := &stubStore{known: map[string]time.Time{ "wiki/ghost.md#0001": {}, "wiki/ghost.md#0002": {}, "wiki/ghost.md#0003": {}, }} res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)}) require.NoError(t, err) assert.Equal(t, 3, res.Deleted) } func TestSync_ReembedsFileWhenMtimeNewer(t *testing.T) { dir := t.TempDir() writeNote(t, dir, "wiki/edited.md", "original body\n") // Force the file's mtime ahead of any plausible store updated_at. future := time.Now().Add(1 * time.Hour) require.NoError(t, os.Chtimes(filepath.Join(dir, "wiki/edited.md"), future, future)) store := &stubStore{ known: map[string]time.Time{ // Existing chunk row pre-dates the file's mtime. 
"wiki/edited.md#0001": time.Now().Add(-1 * time.Hour), }, } emb := stubEmbedder{vec: make([]float32, 768)} res, err := vectorstore.Sync(context.Background(), dir, store, emb) require.NoError(t, err) assert.Equal(t, 1, res.Added, "file with newer mtime should be re-embedded") assert.Contains(t, store.upserts, "wiki/edited.md#0001") // Old chunks of the same parent must be deleted before re-embed so // shrunk files don't leave orphan rows at higher #NNNN indexes. assert.Contains(t, store.deletes, "wiki/edited.md#0001") } func TestSync_SkipsFileWhenMtimeOlder(t *testing.T) { dir := t.TempDir() writeNote(t, dir, "wiki/stable.md", "body\n") // Backdate mtime to before the store's recorded updated_at. past := time.Now().Add(-2 * time.Hour) require.NoError(t, os.Chtimes(filepath.Join(dir, "wiki/stable.md"), past, past)) store := &stubStore{ known: map[string]time.Time{ "wiki/stable.md#0001": time.Now(), }, } emb := stubEmbedder{vec: make([]float32, 768)} res, err := vectorstore.Sync(context.Background(), dir, store, emb) require.NoError(t, err) assert.Equal(t, 0, res.Added) assert.Empty(t, store.upserts) assert.Empty(t, store.deletes) } func TestSync_NoOpWhenComponentsNil(t *testing.T) { dir := t.TempDir() writeNote(t, dir, "wiki/a/facts/x.md", "x") res, err := vectorstore.Sync(context.Background(), dir, nil, nil) require.NoError(t, err) assert.Equal(t, 0, res.Added) } func TestSync_CollectsEmbedderErrors(t *testing.T) { dir := t.TempDir() writeNote(t, dir, "wiki/a/facts/x.md", "x") store := &stubStore{known: map[string]time.Time{}} emb := stubEmbedder{err: errors.New("upstream down")} res, err := vectorstore.Sync(context.Background(), dir, store, emb) require.NoError(t, err) assert.Equal(t, 0, res.Added) assert.Len(t, res.Errors, 1) }