The embed sync goroutine only walked brain/wiki/. brain/knowledge/ (112 curated entries; per CLAUDE.md, the most important brain content) had zero coverage in brain_embeddings, so vector retrieval was blind to it: hybrid BM25 + pgvector retrieval would never surface a curated knowledge entry via the vector arm. Extract the per-root walk into a loop over a small subdir list and add "knowledge" alongside "wiki". scanDirs is package-level so it stays a single source of truth for what gets embedded. Also log each failing item's path + error string from StartSync. Previously only the aggregate count was logged, so a persistent `errors=1` per cycle was opaque. With per-item warnings, the actual ollama "input length exceeds the context length" error surfaces immediately. Refs gitea/mathias/infra#37 (this commit covers the knowledge/ scan bug; the long-file chunking bug is a separate change).
152 lines
4.5 KiB
Go
152 lines
4.5 KiB
Go
package vectorstore_test
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
|
|
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// stubStore is an in-memory test double for the store handed to
// vectorstore.Sync. It records every mutation so tests can inspect them.
type stubStore struct {
	// known is the set of paths reported by KnownPaths.
	known map[string]struct{}
	// upserts holds the vectors accepted by Upsert, keyed by path.
	upserts map[string][]float32
	// deletes lists the paths passed to Delete, in call order.
	deletes []string
	// failNext, when non-nil, is returned by the next Upsert call and then
	// cleared, so only a single call fails.
	failNext error
}

// KnownPaths returns a fresh copy of the known set, so callers may mutate the
// result without affecting the stub. It never returns an error.
func (s *stubStore) KnownPaths(_ context.Context) (map[string]struct{}, error) {
	snapshot := make(map[string]struct{}, len(s.known))
	for p := range s.known {
		snapshot[p] = struct{}{}
	}
	return snapshot, nil
}

// Upsert records v under path. If failNext is set, that error is consumed and
// returned instead, and nothing is recorded.
func (s *stubStore) Upsert(_ context.Context, path string, v []float32) error {
	if injected := s.failNext; injected != nil {
		s.failNext = nil
		return injected
	}
	// The map is created lazily so a zero-value stubStore is usable.
	if s.upserts == nil {
		s.upserts = map[string][]float32{}
	}
	s.upserts[path] = v
	return nil
}

// Delete appends path to deletes and always succeeds.
func (s *stubStore) Delete(_ context.Context, path string) error {
	s.deletes = append(s.deletes, path)
	return nil
}
|
|
|
|
// stubEmbedder is a canned embedder: every call yields the same vector and
// error, regardless of the text it is asked to embed.
type stubEmbedder struct {
	// vec is the vector returned from Embed.
	vec []float32
	// err is the error returned from Embed.
	err error
}

// Embed ignores its arguments and returns the configured vec and err as-is.
func (e stubEmbedder) Embed(_ context.Context, _ string) ([]float32, error) {
	vec, err := e.vec, e.err
	return vec, err
}
|
|
|
|
func writeNote(t *testing.T, dir, rel, body string) {
|
|
t.Helper()
|
|
full := filepath.Join(dir, rel)
|
|
require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
|
|
require.NoError(t, os.WriteFile(full, []byte(body), 0o644))
|
|
}
|
|
|
|
func TestSync_AddsNewFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/jepa-fx/facts/x.md", "body of x")
|
|
writeNote(t, dir, "wiki/jepa-fx/facts/y.md", "body of y")
|
|
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 2, res.Added)
|
|
assert.Empty(t, res.Deleted)
|
|
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md")
|
|
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md")
|
|
}
|
|
|
|
func TestSync_SkipsAlreadyKnown(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
|
|
store := &stubStore{known: map[string]struct{}{"wiki/a/facts/x.md": {}}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Empty(t, store.upserts)
|
|
}
|
|
|
|
func TestSync_DeletesDisappearedFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
|
// store has a path that doesn't exist on disk anymore
|
|
store := &stubStore{known: map[string]struct{}{"wiki/old/facts/ghost.md": {}}}
|
|
res, err := vectorstore.Sync(context.Background(), dir, &stubStoreWithDelete{stubStore: store}, stubEmbedder{vec: make([]float32, 768)})
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Deleted)
|
|
}
|
|
|
|
// stubStoreWithDelete is a thin wrapper to capture Delete calls;
|
|
// stubStore already implements Delete but we need the wrapper to mix
|
|
// store interfaces with sync-specific expectations.
|
|
type stubStoreWithDelete struct {
|
|
*stubStore
|
|
}
|
|
|
|
func TestSync_SkipsIndexFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/_index.md", "moc")
|
|
writeNote(t, dir, "wiki/a/facts/real.md", "body")
|
|
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Added)
|
|
assert.NotContains(t, store.upserts, "wiki/a/_index.md")
|
|
}
|
|
|
|
func TestSync_ScansKnowledgeDir(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
writeNote(t, dir, "knowledge/2026-05-19-koala-gpu-setup.md", "knowledge body")
|
|
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 2, res.Added)
|
|
assert.Contains(t, store.upserts, "wiki/a/facts/x.md")
|
|
assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md")
|
|
}
|
|
|
|
func TestSync_NoOpWhenComponentsNil(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
res, err := vectorstore.Sync(context.Background(), dir, nil, nil)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
}
|
|
|
|
func TestSync_CollectsEmbedderErrors(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
emb := stubEmbedder{err: errors.New("upstream down")}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Len(t, res.Errors, 1)
|
|
}
|