Removes the TODO in Sync that left files static after their first embed. Edits to brain/wiki/ and brain/knowledge/ now surface in subsequent syncs without manual /backfill-embeddings calls.

Approach
- Store interface: KnownPaths → KnownPathsWithTime returning path → updated_at. Callers compare against file mtime to detect edits.
- PGStore: SELECT path, updated_at FROM brain_embeddings.
- Sync groups known chunks by parent path and tracks the EARLIEST updated_at per parent. A file is stale when its mtime is after that oldest chunk's timestamp — any chunk older than the file means at least one chunk hasn't been refreshed since the last edit.
- Stale-path rewrite: delete every old chunk for the parent (handles "file shrunk → fewer chunks → orphan rows at higher #NNNN" cleanly), then re-chunk + re-embed + re-upsert.

Tests
- New: TestSync_ReembedsFileWhenMtimeNewer — file mtime forced into the future vs store updated_at; Sync deletes old chunk + upserts fresh one.
- New: TestSync_SkipsFileWhenMtimeOlder — file mtime backdated; Sync is a no-op (no upserts, no deletes).
- Updated: stubStore.known is now map[string]time.Time. A zero value resolves to a far-future sentinel so existing "skip if already known" tests keep passing without per-test setup.
- pg_test renamed KnownPaths integration → KnownPathsWithTime; asserts updated_at is non-zero and within 5s of insert wall-clock.

Backward compat
- brain_embeddings rows pre-dating this change carry valid updated_at values (column was always populated via `DEFAULT now()` + ON CONFLICT `updated_at = now()`). No migration needed. Live pod will start re-embedding any file whose source has been edited since its chunks were originally written.

Closes gitea/mathias/hyperguild#23.
95 lines
2.5 KiB
Go
95 lines
2.5 KiB
Go
package vectorstore_test
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// The integration tests in this file talk to a real postgres18 + pgvector
// instance. They are gated on BRAIN_PG_TEST_DSN so that `task check` stays
// hermetic on hosts that have no reachable database.
//
// To run:
//
//	BRAIN_PG_TEST_DSN='postgres://brain_app:pwd@127.0.0.1:5432/brain' \
//	  go test ./internal/vectorstore/... -run Integration
//
// dsn returns the value of BRAIN_PG_TEST_DSN. When the variable is unset or
// empty the calling test is skipped instead of failed.
func dsn(t *testing.T) string {
	t.Helper()
	if conn := os.Getenv("BRAIN_PG_TEST_DSN"); conn != "" {
		return conn
	}
	t.Skip("BRAIN_PG_TEST_DSN not set; skipping pgvector integration tests")
	// Not reached: t.Skip stops the calling test. The return only satisfies
	// the compiler.
	return ""
}
|
|
|
|
func freshStore(t *testing.T) (*vectorstore.PGStore, context.Context) {
|
|
t.Helper()
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
t.Cleanup(cancel)
|
|
s, err := vectorstore.New(ctx, dsn(t))
|
|
require.NoError(t, err)
|
|
t.Cleanup(s.Close)
|
|
require.NoError(t, s.Init(ctx))
|
|
// Clean slate per test.
|
|
_, _ = s.KnownPathsWithTime(ctx)
|
|
require.NoError(t, s.Delete(ctx, "%test-fixture%"))
|
|
return s, ctx
|
|
}
|
|
|
|
// vec builds a dim-element vector in which every component equals fill.
// A dim of zero yields an empty, non-nil slice.
func vec(dim int, fill float32) []float32 {
	out := make([]float32, dim)
	for idx := 0; idx < len(out); idx++ {
		out[idx] = fill
	}
	return out
}
|
|
|
|
func TestIntegration_UpsertAndSearch(t *testing.T) {
|
|
s, ctx := freshStore(t)
|
|
|
|
require.NoError(t, s.Upsert(ctx, "wiki/a.md", vec(768, 1.0)))
|
|
require.NoError(t, s.Upsert(ctx, "wiki/b.md", vec(768, -1.0)))
|
|
|
|
hits, err := s.Search(ctx, vec(768, 1.0), 2)
|
|
require.NoError(t, err)
|
|
require.GreaterOrEqual(t, len(hits), 1)
|
|
assert.Equal(t, "wiki/a.md", hits[0].Path)
|
|
assert.InDelta(t, 0.0, hits[0].Distance, 1e-5)
|
|
|
|
t.Cleanup(func() {
|
|
_ = s.Delete(ctx, "wiki/a.md")
|
|
_ = s.Delete(ctx, "wiki/b.md")
|
|
})
|
|
}
|
|
|
|
func TestIntegration_KnownPathsWithTime(t *testing.T) {
|
|
s, ctx := freshStore(t)
|
|
before := time.Now()
|
|
require.NoError(t, s.Upsert(ctx, "wiki/k.md", vec(768, 0.5)))
|
|
t.Cleanup(func() { _ = s.Delete(ctx, "wiki/k.md") })
|
|
|
|
paths, err := s.KnownPathsWithTime(ctx)
|
|
require.NoError(t, err)
|
|
at, ok := paths["wiki/k.md"]
|
|
require.True(t, ok)
|
|
assert.False(t, at.IsZero(), "updated_at must not be zero")
|
|
assert.WithinDuration(t, before, at, 5*time.Second, "updated_at must be recent")
|
|
}
|
|
|
|
func TestUpsert_RejectsWrongDimension(t *testing.T) {
|
|
s := &vectorstore.PGStore{}
|
|
err := s.Upsert(context.Background(), "x", vec(100, 0))
|
|
require.Error(t, err)
|
|
}
|
|
|
|
func TestSearch_RejectsWrongDimension(t *testing.T) {
|
|
s := &vectorstore.PGStore{}
|
|
_, err := s.Search(context.Background(), vec(100, 0), 5)
|
|
require.Error(t, err)
|
|
}
|