Files
Mathias 815739758e
All checks were successful
CI / Lint / Test / Vet (push) Successful in 11s
CI / Mirror to GitHub (push) Has been skipped
feat(vectorstore): re-embed on file mtime > store updated_at (#23)
Removes the TODO in Sync that left files static after their first embed.
Edits to brain/wiki/ and brain/knowledge/ now surface in subsequent
syncs without manual /backfill-embeddings calls.

Approach
- Store interface: KnownPaths → KnownPathsWithTime returning path →
  updated_at. Callers compare against file mtime to detect edits.
- PGStore: SELECT path, updated_at FROM brain_embeddings.
- Sync groups known chunks by parent path and tracks the EARLIEST
  updated_at per parent. A file is stale when its mtime is after that
  oldest chunk's timestamp — any chunk older than the file means at
  least one chunk hasn't been refreshed since the last edit.
- Stale-path rewrite: delete every old chunk for the parent (handles
  "file shrunk → fewer chunks → orphan rows at higher #NNNN" cleanly),
  then re-chunk + re-embed + re-upsert.

Tests
- New: TestSync_ReembedsFileWhenMtimeNewer — file mtime forced into the
  future vs store updated_at; Sync deletes old chunk + upserts fresh one.
- New: TestSync_SkipsFileWhenMtimeOlder — file mtime backdated; Sync is
  a no-op (no upserts, no deletes).
- Updated: stubStore.known is now map[string]time.Time. A zero value
  resolves to a far-future sentinel so existing "skip if already known"
  tests keep passing without per-test setup.
- pg_test renamed KnownPaths integration → KnownPathsWithTime; asserts
  updated_at is non-zero and within 5s of insert wall-clock.

Backward compat
- brain_embeddings rows pre-dating this change carry valid updated_at
  values (column was always populated via `DEFAULT now()` + ON CONFLICT
  `updated_at = now()`). No migration needed. Live pod will start
  re-embedding any file whose source has been edited since its chunks
  were originally written.

Closes gitea/mathias/hyperguild#23.
2026-05-20 09:50:45 +02:00

95 lines
2.5 KiB
Go

package vectorstore_test
import (
"context"
"os"
"testing"
"time"
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// dsn returns the PostgreSQL connection string used by the integration tests,
// read from the BRAIN_PG_TEST_DSN environment variable. When the variable is
// empty or unset the calling test is skipped, which keeps `task check`
// hermetic on hosts without a reachable database.
//
// The integration tests run against a real postgres18 + pgvector. To run:
//
//	BRAIN_PG_TEST_DSN='postgres://brain_app:pwd@127.0.0.1:5432/brain' \
//	  go test ./internal/vectorstore/... -run Integration
func dsn(t *testing.T) string {
	t.Helper()
	if conn := os.Getenv("BRAIN_PG_TEST_DSN"); conn != "" {
		return conn
	}
	// t.Skip stops the calling test; the return below only satisfies the
	// compiler.
	t.Skip("BRAIN_PG_TEST_DSN not set; skipping pgvector integration tests")
	return ""
}
// freshStore opens a PGStore against the DSN returned by dsn (skipping the
// test when none is configured), initialises it, and removes rows matching
// "%test-fixture%" so each test starts from a clean slate.
//
// The returned context carries a 10-second timeout. Its cancel function and
// the store's Close method are both registered with t.Cleanup, so callers do
// not need to release anything themselves.
func freshStore(t *testing.T) (*vectorstore.PGStore, context.Context) {
	t.Helper()

	connStr := dsn(t)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	t.Cleanup(cancel)

	store, err := vectorstore.New(ctx, connStr)
	require.NoError(t, err)
	t.Cleanup(store.Close)

	err = store.Init(ctx)
	require.NoError(t, err)

	// Clean slate per test.
	// NOTE(review): the result and error of KnownPathsWithTime are discarded
	// here; the call looks like a leftover — confirm whether it is needed.
	_, _ = store.KnownPathsWithTime(ctx)
	// Presumably Delete treats its argument as a LIKE-style pattern; verify
	// against the PGStore implementation.
	err = store.Delete(ctx, "%test-fixture%")
	require.NoError(t, err)

	return store, ctx
}
// vec builds a slice of dim float32 values, every element set to fill.
// It is used to fabricate embedding vectors for the tests.
func vec(dim int, fill float32) []float32 {
	out := make([]float32, dim)
	for idx := 0; idx < len(out); idx++ {
		out[idx] = fill
	}
	return out
}
// TestIntegration_UpsertAndSearch upserts two 768-dimensional vectors with
// opposite fills and checks that searching with the first vector returns
// "wiki/a.md" as the top hit at a distance of (approximately) zero.
//
// Requires a live database; skipped when BRAIN_PG_TEST_DSN is not set.
func TestIntegration_UpsertAndSearch(t *testing.T) {
	s, ctx := freshStore(t)

	// Register the cleanup before the first Upsert. The require.* assertions
	// below stop the test immediately on failure, so a cleanup registered
	// after them would never be installed and the rows would leak into the
	// shared database. Cleanups run last-in-first-out, so this still executes
	// before the store is closed and the context is cancelled.
	t.Cleanup(func() {
		// Best-effort removal: a failure here must not mask the test result.
		_ = s.Delete(ctx, "wiki/a.md")
		_ = s.Delete(ctx, "wiki/b.md")
	})

	require.NoError(t, s.Upsert(ctx, "wiki/a.md", vec(768, 1.0)))
	require.NoError(t, s.Upsert(ctx, "wiki/b.md", vec(768, -1.0)))

	hits, err := s.Search(ctx, vec(768, 1.0), 2)
	require.NoError(t, err)
	require.GreaterOrEqual(t, len(hits), 1)
	assert.Equal(t, "wiki/a.md", hits[0].Path)
	assert.InDelta(t, 0.0, hits[0].Distance, 1e-5)
}
// TestIntegration_KnownPathsWithTime upserts a single path and verifies that
// KnownPathsWithTime reports it with a non-zero updated_at that lies within
// five seconds of the wall-clock time captured just before the upsert.
//
// Requires a live database; skipped when BRAIN_PG_TEST_DSN is not set.
func TestIntegration_KnownPathsWithTime(t *testing.T) {
	const path = "wiki/k.md"

	store, ctx := freshStore(t)

	start := time.Now()
	err := store.Upsert(ctx, path, vec(768, 0.5))
	require.NoError(t, err)
	// Best-effort removal of the row once the test finishes.
	t.Cleanup(func() { _ = store.Delete(ctx, path) })

	known, err := store.KnownPathsWithTime(ctx)
	require.NoError(t, err)

	updatedAt, found := known[path]
	require.True(t, found)
	assert.False(t, updatedAt.IsZero(), "updated_at must not be zero")
	assert.WithinDuration(t, start, updatedAt, 5*time.Second, "updated_at must be recent")
}
// TestUpsert_RejectsWrongDimension checks that Upsert returns an error for a
// 100-element vector. It runs against a zero-value PGStore, so no database
// connection is involved.
func TestUpsert_RejectsWrongDimension(t *testing.T) {
	store := new(vectorstore.PGStore)
	badVec := vec(100, 0)

	require.Error(t, store.Upsert(context.Background(), "x", badVec))
}
// TestSearch_RejectsWrongDimension checks that Search returns an error when
// queried with a 100-element vector. It runs against a zero-value PGStore, so
// no database connection is involved.
func TestSearch_RejectsWrongDimension(t *testing.T) {
	store := new(vectorstore.PGStore)
	badQuery := vec(100, 0)

	_, searchErr := store.Search(context.Background(), badQuery, 5)
	require.Error(t, searchErr)
}