feat(ingestion): chunk markdown before embedding (#38)
Long markdown files (>~8KB) silently failed to embed because nomic-embed-text on iguana has a 2048-token context. embed sync logged errors=1 every cycle with no useful body until #37 added per-item logging — three files exceed the ceiling: finbert source (8 KB), koala-machine-state (7.1 KB), litellm-absorption (8.8 KB). Curated knowledge entries should never be vector-blind.

Approach: chunk-before-embed, no schema change.

vectorstore/chunk.go (new)
- ChunkMarkdown splits at H1/H2 boundaries; sections over maxBytes are further split at paragraph boundaries, packing greedily under budget.
- NumberChunks assigns "<parent>#NNNN" storage paths (1-based, zero-padded to 4 digits — handles files with up to ~10k sections in stable sort order).
- ParentPath strips the chunk suffix for retrieval-side dedup.

vectorstore/sync.go
- After ChunkMarkdown produces N pieces, each is embedded + upserted as a separate brain_embeddings row at "<parent>#NNNN". maxChunkBytes = 4000 (≈1000 nomic tokens, well under the 2048 ceiling with headroom for unicode/code blocks).
- "Already embedded?" check now reduces known paths to parent set via ParentPath, so the first chunk hit short-circuits the file.
- Delete walk also reduces via ParentPath; when a parent file disappears, every chunk row (and any pre-existing bare-path row, for backward compatibility with rows written before this change) gets dropped.

search/search.go
- hybridMerge collapses chunk-path vector hits to parent via ParentPath before scope check, RRF accumulation, and hydration. A file with three chunk hits returns one result row, not three.

Backward compatibility: pre-existing bare-path rows in brain_embeddings keep working — ParentPath returns them unchanged, knownParents handles them as if they were "wiki/foo.md#NNNN" hits, sync skips re-embed, and search dedup is a no-op for them. No migration required to ship.
Tests:
- chunk_test.go covers short / heading split / oversized section / content preservation / chunk numbering / parent-path stripping.
- sync_test.go adds long-file chunking, single-chunk-row short file, skip-if-any-chunk-known, delete-all-chunks-of-disappeared-file. Existing tests updated for #NNNN paths.
- search_test.go adds chunk-paths-dedupe-to-parent.

Closes gitea/mathias/infra#38.
This commit is contained in:
@@ -37,6 +37,13 @@ type SyncResult struct {
|
||||
// source pages; knowledge/ holds curated hand-written entries.
|
||||
var scanDirs = []string{"wiki", "knowledge"}
|
||||
|
||||
// maxChunkBytes is the per-chunk byte budget passed to ChunkMarkdown.
|
||||
// Sized to fit comfortably under nomic-embed-text's 2048-token default
|
||||
// context (~4 chars/token for English markdown → ~8 KB ceiling; we sit
|
||||
// at 4 KB to leave headroom for unicode, code blocks, and tokenizer
|
||||
// variance).
|
||||
const maxChunkBytes = 4000
|
||||
|
||||
// Sync brings the embedding store in line with brain/{wiki,knowledge}/
|
||||
// on disk:
|
||||
// - new files (in the tree, not in the store) get embedded + upserted
|
||||
@@ -55,7 +62,13 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("known paths: %w", err)
|
||||
}
|
||||
seen := make(map[string]struct{})
|
||||
// Build a parent → "any chunk known?" set so we can skip files that
|
||||
// already have at least one chunk row in the store.
|
||||
knownParents := make(map[string]struct{}, len(known))
|
||||
for p := range known {
|
||||
knownParents[ParentPath(p)] = struct{}{}
|
||||
}
|
||||
seenParents := make(map[string]struct{})
|
||||
|
||||
for _, sub := range scanDirs {
|
||||
root := filepath.Join(brainDir, sub)
|
||||
@@ -75,11 +88,12 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
|
||||
return err
|
||||
}
|
||||
relSlash := filepath.ToSlash(rel)
|
||||
seen[relSlash] = struct{}{}
|
||||
seenParents[relSlash] = struct{}{}
|
||||
|
||||
if _, ok := known[relSlash]; ok {
|
||||
// Already embedded — TODO: compare mtime once Store exposes
|
||||
// updated_at so we re-embed on edit. For now, skip.
|
||||
if _, ok := knownParents[relSlash]; ok {
|
||||
// File has at least one chunk in the store already.
|
||||
// TODO: compare mtime once Store exposes updated_at so we
|
||||
// re-embed on edit. For now, skip.
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -88,16 +102,19 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
|
||||
res.Errors = append(res.Errors, fmt.Errorf("read %s: %w", relSlash, readErr))
|
||||
return nil
|
||||
}
|
||||
vec, embErr := embedder.Embed(ctx, string(content))
|
||||
if embErr != nil {
|
||||
res.Errors = append(res.Errors, fmt.Errorf("embed %s: %w", relSlash, embErr))
|
||||
return nil
|
||||
chunks := NumberChunks(relSlash, ChunkMarkdown(string(content), maxChunkBytes))
|
||||
for _, ch := range chunks {
|
||||
vec, embErr := embedder.Embed(ctx, ch.Content)
|
||||
if embErr != nil {
|
||||
res.Errors = append(res.Errors, fmt.Errorf("embed %s: %w", ch.Path, embErr))
|
||||
continue
|
||||
}
|
||||
if upErr := store.Upsert(ctx, ch.Path, vec); upErr != nil {
|
||||
res.Errors = append(res.Errors, fmt.Errorf("upsert %s: %w", ch.Path, upErr))
|
||||
continue
|
||||
}
|
||||
res.Added++
|
||||
}
|
||||
if upErr := store.Upsert(ctx, relSlash, vec); upErr != nil {
|
||||
res.Errors = append(res.Errors, fmt.Errorf("upsert %s: %w", relSlash, upErr))
|
||||
return nil
|
||||
}
|
||||
res.Added++
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
@@ -105,9 +122,9 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
|
||||
}
|
||||
}
|
||||
|
||||
// Drop rows whose file is gone.
|
||||
// Drop chunk rows whose parent file is gone.
|
||||
for path := range known {
|
||||
if _, ok := seen[path]; ok {
|
||||
if _, ok := seenParents[ParentPath(path)]; ok {
|
||||
continue
|
||||
}
|
||||
if err := store.Delete(ctx, path); err != nil {
|
||||
|
||||
Reference in New Issue
Block a user