Files
hyperguild/ingestion/internal/search/search_test.go
Mathias 37fdd33b2d
All checks were successful
CI / Lint / Test / Vet (push) Successful in 11s
CI / Mirror to GitHub (push) Has been skipped
feat(ingestion): chunk markdown before embedding (#38)
Long markdown files (roughly 7 KB and up) silently failed to embed because
nomic-embed-text on iguana has a 2048-token context. embed sync logged
errors=1 every cycle with no useful body until #37 added per-item logging,
which showed that three files exceed the ceiling: finbert source (8 KB),
koala-machine-state (7.1 KB), litellm-absorption (8.8 KB). Curated knowledge
entries should never be vector-blind.

Approach: chunk-before-embed, no schema change.

vectorstore/chunk.go (new)
- ChunkMarkdown splits at H1/H2 boundaries; sections over maxBytes are
  further split at paragraph boundaries, packing greedily under budget.
- NumberChunks assigns "<parent>#NNNN" storage paths (1-based, zero-padded
  to 4 digits — handles files with up to ~10k sections in stable sort order).
- ParentPath strips the chunk suffix for retrieval-side dedup.

vectorstore/sync.go
- After ChunkMarkdown produces N pieces, each is embedded + upserted as a
  separate brain_embeddings row at "<parent>#NNNN". maxChunkBytes = 4000
  (≈1000 nomic tokens, well under the 2048 ceiling with headroom for
  unicode/code blocks).
- "Already embedded?" check now reduces known paths to parent set via
  ParentPath, so the first chunk hit short-circuits the file.
- Delete walk also reduces via ParentPath; when a parent file disappears,
  every chunk row (and any pre-existing bare-path row, for backward
  compatibility with rows written before this change) gets dropped.

search/search.go
- hybridMerge collapses chunk-path vector hits to parent via ParentPath
  before scope check, RRF accumulation, and hydration. A file with three
  chunk hits returns one result row, not three.

Backward compatibility: pre-existing bare-path rows in brain_embeddings
keep working — ParentPath returns them unchanged, knownParents handles
them as if they were "wiki/foo.md#NNNN" hits, sync skips re-embed, and
search dedup is a no-op for them. No migration required to ship.

Tests:
- chunk_test.go covers short / heading split / oversized section /
  content preservation / chunk numbering / parent-path stripping.
- sync_test.go adds long-file chunking, single-chunk-row short file,
  skip-if-any-chunk-known, delete-all-chunks-of-disappeared-file.
  Existing tests updated for #NNNN paths.
- search_test.go adds chunk-paths-dedupe-to-parent.

Closes gitea/mathias/infra#38.
2026-05-19 21:57:09 +02:00

190 lines
6.8 KiB
Go

// ingestion/internal/search/search_test.go
package search_test
import (
"context"
"fmt"
"os"
"path/filepath"
"testing"
"github.com/mathiasbq/hyperguild/ingestion/internal/search"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// stubEmbedder is a test double for the embedder passed in
// search.QueryOptions. It ignores its input and always hands back the
// vector it was constructed with.
type stubEmbedder struct {
	vec []float32
}

// Embed returns the preconfigured vector and a nil error, regardless of the
// context or text it is given.
func (e stubEmbedder) Embed(_ context.Context, _ string) ([]float32, error) {
	return e.vec, nil
}
// stubVector is a test double for the vector store passed in
// search.QueryOptions. Every Search call yields the same canned hits.
type stubVector struct {
	hits []search.VectorHit
}

// Search ignores the context, the query vector and the requested count, and
// returns the preconfigured hits with a nil error.
func (v stubVector) Search(_ context.Context, _ []float32, _ int) ([]search.VectorHit, error) {
	return v.hits, nil
}
// TestSearch_HybridRRFPromotesVectorOnlyHit writes two notes, only one of
// which contains the query keyword, and stubs the vector store so that the
// keyword-free note is the closest vector match. Both notes must appear in the
// results of a hybrid query.
func TestSearch_HybridRRFPromotesVectorOnlyHit(t *testing.T) {
	const (
		keywordNote  = "wiki/jepa-fx/facts/foo.md"      // contains "lejpa" once
		semanticNote = "wiki/jepa-fx/facts/semantic.md" // does NOT contain the keyword
	)

	root := t.TempDir()
	fixtures := []struct{ rel, body string }{
		{keywordNote, "---\ntitle: Foo\n---\nlejpa keyword\n"},
		{semanticNote, "---\ntitle: Semantic\n---\nNo keyword in body.\n"},
	}
	for _, f := range fixtures {
		abs := filepath.Join(root, f.rel)
		require.NoError(t, os.MkdirAll(filepath.Dir(abs), 0o755))
		require.NoError(t, os.WriteFile(abs, []byte(f.body), 0o644))
	}

	opts := search.QueryOptions{
		Query: "lejpa",
		Limit: 5,
		Vector: stubVector{hits: []search.VectorHit{
			{Path: semanticNote, Distance: 0.05}, // best vector match
			{Path: keywordNote, Distance: 0.10},
		}},
		Embedder: stubEmbedder{vec: []float32{0.1}},
	}

	results, err := search.Query(root, opts)
	require.NoError(t, err)
	require.Len(t, results, 2, "vector-only hit should be hydrated into results")

	// Order is not asserted here; only membership of both notes.
	returned := []string{results[0].Path, results[1].Path}
	assert.Contains(t, returned, keywordNote)
	assert.Contains(t, returned, semanticNote)
}
// TestSearch_HybridDedupesChunkPathsToParent stubs the vector store with three
// "<parent>#NNNN" chunk hits that all belong to one file, and checks that the
// hybrid query reports that file exactly once under its bare parent path.
func TestSearch_HybridDedupesChunkPathsToParent(t *testing.T) {
	root := t.TempDir()
	notePath := filepath.Join(root, "knowledge", "long.md")
	require.NoError(t, os.MkdirAll(filepath.Dir(notePath), 0o755))

	// The body carries the BM25 keyword "alpaca" so hybridMerge actually runs
	// (it only kicks in when BM25 returns at least one candidate).
	body := []byte("---\ntitle: Long\n---\nalpaca content.\n")
	require.NoError(t, os.WriteFile(notePath, body, 0o644))

	// Three chunk-path hits, all pointing at the same parent file. The merged
	// result must surface ONE row per parent — not three rows with
	// chunk-suffixed paths.
	chunkHits := []search.VectorHit{
		{Path: "knowledge/long.md#0001", Distance: 0.05},
		{Path: "knowledge/long.md#0002", Distance: 0.07},
		{Path: "knowledge/long.md#0003", Distance: 0.09},
	}

	results, err := search.Query(root, search.QueryOptions{
		Query:    "alpaca",
		Limit:    5,
		Vector:   stubVector{hits: chunkHits},
		Embedder: stubEmbedder{vec: []float32{0.1}},
	})
	require.NoError(t, err)
	require.Len(t, results, 1, "three chunk hits for one parent must merge to one result")

	assert.Equal(t, "knowledge/long.md", results[0].Path)
	assert.Equal(t, "Long", results[0].Title)
}
// TestSearch_HybridFallsBackOnEmbedderError runs a hybrid query whose embedder
// always fails and checks that the query still succeeds and returns the
// keyword match.
func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) {
	root := t.TempDir()
	notePath := filepath.Join(root, "wiki", "x.md")
	require.NoError(t, os.MkdirAll(filepath.Dir(notePath), 0o755))
	require.NoError(t, os.WriteFile(notePath, []byte("keyword foo"), 0o644))

	opts := search.QueryOptions{
		Query:    "keyword",
		Limit:    5,
		Vector:   stubVector{},    // no vector hits configured
		Embedder: errorEmbedder{}, // every Embed call returns an error
	}

	results, err := search.Query(root, opts)
	require.NoError(t, err)
	require.Len(t, results, 1, "BM25 result should still come back when embedder fails")
	assert.Equal(t, "wiki/x.md", results[0].Path)
}
// errorEmbedder is a test double whose Embed call always fails. It is used to
// exercise the query path when no embedding can be produced.
type errorEmbedder struct{}

// Embed ignores its arguments and returns a nil vector together with
// testify's generic assert.AnError.
func (errorEmbedder) Embed(_ context.Context, _ string) ([]float32, error) {
	var none []float32
	return none, assert.AnError
}
// TestSearch_ReturnsMatchingPages writes two knowledge pages and runs a query
// without vector options. Only the page about retry logic must be returned,
// with its path, front-matter title, a positive score, and an excerpt that
// contains "Retry".
func TestSearch_ReturnsMatchingPages(t *testing.T) {
	root := t.TempDir()
	knowledgeDir := filepath.Join(root, "knowledge")
	require.NoError(t, os.MkdirAll(knowledgeDir, 0o755))

	pages := []struct{ name, body string }{
		{
			"retry-logic.md",
			"---\ntitle: Retry Logic\ndomain: software\n---\n\nRetry logic handles transient failures by re-attempting operations.\n",
		},
		{
			"database.md",
			"---\ntitle: Database\ndomain: software\n---\n\nA database stores structured data.\n",
		},
	}
	for _, pg := range pages {
		require.NoError(t, os.WriteFile(filepath.Join(knowledgeDir, pg.name), []byte(pg.body), 0o644))
	}

	results, err := search.Query(root, search.QueryOptions{Query: "retry transient", Limit: 5})
	require.NoError(t, err)
	require.Len(t, results, 1)

	hit := results[0]
	assert.Equal(t, "knowledge/retry-logic.md", hit.Path)
	assert.Equal(t, "Retry Logic", hit.Title)
	assert.Greater(t, hit.Score, 0)
	assert.Contains(t, hit.Excerpt, "Retry")
}
// TestSearch_WingHallScoping writes four notes that all mention the same
// keyword (three under wiki/<wing>/<hall>/ and one under knowledge/), then
// checks how the Wing and Hall options narrow the result set and which
// option combinations are rejected with an error.
func TestSearch_WingHallScoping(t *testing.T) {
	const (
		keyword = "val-vol-r2"
		wing    = "jepa-fx"
	)

	root := t.TempDir()
	notes := []struct{ rel, body string }{
		{"wiki/jepa-fx/decisions/val-vol.md", "---\nwing: jepa-fx\nhall: decisions\n---\nval-vol-r2 keyword.\n"},
		{"wiki/jepa-fx/facts/architecture.md", "---\nwing: jepa-fx\nhall: facts\n---\nval-vol-r2 keyword in facts.\n"},
		{"wiki/hyperguild/decisions/routing.md", "---\nwing: hyperguild\nhall: decisions\n---\nval-vol-r2 reference.\n"},
		{"knowledge/loose.md", "---\n---\nval-vol-r2 in knowledge.\n"},
	}
	for _, n := range notes {
		abs := filepath.Join(root, n.rel)
		require.NoError(t, os.MkdirAll(filepath.Dir(abs), 0o755))
		require.NoError(t, os.WriteFile(abs, []byte(n.body), 0o644))
	}

	// No filter: walk both knowledge/ and wiki/ — all 4 match.
	unscoped, err := search.Query(root, search.QueryOptions{Query: keyword, Limit: 10})
	require.NoError(t, err)
	assert.Len(t, unscoped, 4)

	// Wing scope: 2 jepa-fx hits, no hyperguild, no knowledge.
	wingOnly, err := search.Query(root, search.QueryOptions{Query: keyword, Limit: 10, Wing: wing})
	require.NoError(t, err)
	require.Len(t, wingOnly, 2)
	for i := range wingOnly {
		assert.Equal(t, wing, wingOnly[i].Wing)
	}

	// Wing+Hall scope: 1 hit.
	wingAndHall, err := search.Query(root, search.QueryOptions{
		Query: keyword,
		Limit: 10,
		Wing:  wing,
		Hall:  "decisions",
	})
	require.NoError(t, err)
	require.Len(t, wingAndHall, 1)
	only := wingAndHall[0]
	assert.Equal(t, wing, only.Wing)
	assert.Equal(t, "decisions", only.Hall)
	assert.Equal(t, "wiki/jepa-fx/decisions/val-vol.md", only.Path)

	// Invalid hall rejected.
	_, err = search.Query(root, search.QueryOptions{Query: "x", Wing: wing, Hall: "garbage"})
	require.Error(t, err)

	// Hall without wing rejected.
	_, err = search.Query(root, search.QueryOptions{Query: "x", Hall: "facts"})
	require.Error(t, err)
}
// TestSearch_RespectsLimit writes more matching pages than the requested
// limit and checks that the query returns exactly that many results.
//
// The count is pinned exactly rather than with an upper bound: a "<= limit"
// check also passes when the search returns nothing at all, which would hide
// a broken query path.
func TestSearch_RespectsLimit(t *testing.T) {
	const (
		pageCount = 5 // pages written; every one mentions the query term
		limit     = 3 // must be smaller than pageCount for the test to bite
	)

	dir := t.TempDir()
	knowledgeDir := filepath.Join(dir, "knowledge")
	require.NoError(t, os.MkdirAll(knowledgeDir, 0o755))
	for i := 0; i < pageCount; i++ {
		require.NoError(t, os.WriteFile(
			filepath.Join(knowledgeDir, fmt.Sprintf("page-%d.md", i)),
			[]byte(fmt.Sprintf("---\ntitle: Page %d\n---\n\nThis page mentions retry.\n", i)),
			0o644,
		))
	}

	results, err := search.Query(dir, search.QueryOptions{Query: "retry", Limit: limit})
	require.NoError(t, err)
	assert.Len(t, results, limit, "more pages match than the limit allows, so exactly limit results are expected")
}