feat(ingestion): chunk markdown before embedding (#38)

Long markdown files (>~8KB) silently failed to embed because nomic-embed-text on iguana has a 2048-token context. embed sync logged errors=1 every cycle with no useful body until #37 added per-item logging — three files exceed the ceiling: finbert source (8 KB), koala-machine-state (7.1 KB), litellm-absorption (8.8 KB). Curated knowledge entries should never be vector-blind.

Approach: chunk-before-embed, no schema change.

vectorstore/chunk.go (new)
- ChunkMarkdown splits at H1/H2 boundaries; sections over maxBytes are further split at paragraph boundaries, packing greedily under budget.
- NumberChunks assigns "<parent>#NNNN" storage paths (1-based, zero-padded to 4 digits — handles files with up to ~10k sections in stable sort order).
- ParentPath strips the chunk suffix for retrieval-side dedup.

vectorstore/sync.go
- After ChunkMarkdown produces N pieces, each is embedded + upserted as a separate brain_embeddings row at "<parent>#NNNN". maxChunkBytes = 4000 (≈1000 nomic tokens, well under the 2048 ceiling with headroom for unicode/code blocks).
- "Already embedded?" check now reduces known paths to parent set via ParentPath, so the first chunk hit short-circuits the file.
- Delete walk also reduces via ParentPath; when a parent file disappears, every chunk row (and any pre-existing bare-path row, for backward compatibility with rows written before this change) gets dropped.

search/search.go
- hybridMerge collapses chunk-path vector hits to parent via ParentPath before scope check, RRF accumulation, and hydration. A file with three chunk hits returns one result row, not three.

Backward compatibility: pre-existing bare-path rows in brain_embeddings keep working — ParentPath returns them unchanged, knownParents handles them as if they were "wiki/foo.md#NNNN" hits, sync skips re-embed, and search dedup is a no-op for them. No migration required to ship.

Tests:
- chunk_test.go covers short / heading split / oversized section / content preservation / chunk numbering / parent-path stripping.
- sync_test.go adds long-file chunking, single-chunk-row short file, skip-if-any-chunk-known, delete-all-chunks-of-disappeared-file. Existing tests updated for #NNNN paths.
- search_test.go adds chunk-paths-dedupe-to-parent.

Closes gitea/mathias/infra#38.
2026-05-19 21:57:09 +02:00
parent 078ec029da
commit 37fdd33b2d
6 changed files with 358 additions and 29 deletions
--- a/ingestion/internal/search/search.go
+++ b/ingestion/internal/search/search.go
@@ -12,6 +12,7 @@ import (
 	"strings"

 	"github.com/mathiasbq/hyperguild/ingestion/internal/brain"
+	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
 )

 // VectorSearcher returns the top-limit nearest paths by cosine
@@ -186,17 +187,21 @@ func hybridMerge(ctx context.Context, brainDir string, opts QueryOptions, bm25 [
 		byPath[r.Path] = r
 	}
 	for rank, h := range hits {
-		if opts.Wing != "" && !pathInScope(h.Path, opts.Wing, opts.Hall) {
+		// Vector store keys are chunk paths ("wiki/foo.md#0001"); collapse
+		// back to the parent so multiple chunk hits from the same file
+		// score against a single result row.
+		parent := vectorstore.ParentPath(h.Path)
+		if opts.Wing != "" && !pathInScope(parent, opts.Wing, opts.Hall) {
 			continue
 		}
-		rrf[h.Path] += 1.0 / (rrfK + float64(rank+1))
-		if _, seen := byPath[h.Path]; !seen {
-			r, err := hydrate(brainDir, h.Path)
+		rrf[parent] += 1.0 / (rrfK + float64(rank+1))
+		if _, seen := byPath[parent]; !seen {
+			r, err := hydrate(brainDir, parent)
 			if err != nil {
-				slog.Warn("search: hydrate failed for vector hit", "path", h.Path, "err", err)
+				slog.Warn("search: hydrate failed for vector hit", "path", parent, "err", err)
 				continue
 			}
-			byPath[h.Path] = r
+			byPath[parent] = r
 		}
 	}

--- a/ingestion/internal/search/search_test.go
+++ b/ingestion/internal/search/search_test.go
@@ -55,6 +55,36 @@ func TestSearch_HybridRRFPromotesVectorOnlyHit(t *testing.T) {
 	assert.Contains(t, paths, "wiki/jepa-fx/facts/semantic.md")
 }

+func TestSearch_HybridDedupesChunkPathsToParent(t *testing.T) {
+	dir := t.TempDir()
+	full := filepath.Join(dir, "knowledge", "long.md")
+	require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
+	// The body contains the query term "alpaca" so BM25 matches this file and
+	// hybridMerge runs (it only kicks in when BM25 has at least one candidate).
+	require.NoError(t, os.WriteFile(full, []byte("---\ntitle: Long\n---\nalpaca content.\n"), 0o644))
+
+	embedder := stubEmbedder{vec: []float32{0.1}}
+	// The stub vector store returns three chunk-path hits ("<parent>#NNNN")
+	// for the same parent file. The merged result must contain ONE row keyed
+	// by the parent path, not three rows with chunk-suffixed paths.
+	vector := stubVector{hits: []search.VectorHit{
+		{Path: "knowledge/long.md#0001", Distance: 0.05},
+		{Path: "knowledge/long.md#0002", Distance: 0.07},
+		{Path: "knowledge/long.md#0003", Distance: 0.09},
+	}}
+
+	got, err := search.Query(dir, search.QueryOptions{
+		Query:    "alpaca",
+		Limit:    5,
+		Vector:   vector,
+		Embedder: embedder,
+	})
+	require.NoError(t, err)
+	require.Len(t, got, 1, "three chunk hits for one parent must merge to one result")
+	assert.Equal(t, "knowledge/long.md", got[0].Path)
+	assert.Equal(t, "Long", got[0].Title)
+}
+
 func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) {
 	dir := t.TempDir()
 	require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))