Long markdown files (>~8KB) silently failed to embed because nomic-embed-text on iguana has a 2048-token context. embed sync logged errors=1 every cycle with no useful body until #37 added per-item logging — three files exceed the ceiling: finbert source (8 KB), koala-machine-state (7.1 KB), litellm-absorption (8.8 KB). Curated knowledge entries should never be vector-blind.

Approach: chunk-before-embed, no schema change.

vectorstore/chunk.go (new)
- ChunkMarkdown splits at H1/H2 boundaries; sections over maxBytes are further split at paragraph boundaries, packing greedily under budget.
- NumberChunks assigns "<parent>#NNNN" storage paths (1-based, zero-padded to 4 digits — handles files with up to ~10k sections in stable sort order).
- ParentPath strips the chunk suffix for retrieval-side dedup.

vectorstore/sync.go
- After ChunkMarkdown produces N pieces, each is embedded + upserted as a separate brain_embeddings row at "<parent>#NNNN". maxChunkBytes = 4000 (≈1000 nomic tokens, well under the 2048 ceiling with headroom for unicode/code blocks).
- "Already embedded?" check now reduces known paths to the parent set via ParentPath, so the first chunk hit short-circuits the file.
- Delete walk also reduces via ParentPath; when a parent file disappears, every chunk row (and any pre-existing bare-path row, for backward compatibility with rows written before this change) gets dropped.

search/search.go
- hybridMerge collapses chunk-path vector hits to their parent via ParentPath before scope check, RRF accumulation, and hydration. A file with three chunk hits returns one result row, not three.

Backward compatibility: pre-existing bare-path rows in brain_embeddings keep working — ParentPath returns them unchanged, knownParents handles them as if they were "wiki/foo.md#NNNN" hits, sync skips re-embed, and search dedup is a no-op for them. No migration required to ship.

Tests:
- chunk_test.go covers short / heading split / oversized section / content preservation / chunk numbering / parent-path stripping.
- sync_test.go adds long-file chunking, single-chunk-row short file, skip-if-any-chunk-known, delete-all-chunks-of-disappeared-file. Existing tests updated for #NNNN paths.
- search_test.go adds chunk-paths-dedupe-to-parent.

Closes gitea/mathias/infra#38.
138 lines
3.9 KiB
Go
138 lines
3.9 KiB
Go
package vectorstore
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// NumberedChunk is one chunk body together with the storage path it is
// written under in brain_embeddings. The path has the form
// "<parent>#NNNN", where NNNN is the 1-based chunk index zero-padded to
// 4 digits.
type NumberedChunk struct {
	Path    string // storage path, "<parent>#NNNN"
	Content string // chunk body
}
|
|
|
|
// ParentPath returns the file path with a trailing "#NNNN" chunk suffix
// removed. Used by search to dedupe chunk-level hits back to a single
// document per result.
//
// Only a suffix of the shape written by NumberChunks is stripped: the
// last "#" in p followed by four or more ASCII digits and nothing else.
// Any other input — including a bare path whose file name happens to
// contain "#" (e.g. "notes/c#-tips.md") — is returned unchanged, so such
// files are not collapsed onto a truncated parent.
func ParentPath(p string) string {
	i := strings.LastIndexByte(p, '#')
	if i < 0 {
		return p
	}
	suffix := p[i+1:]
	// "%04d" always yields at least four digits (more past chunk 9999).
	if len(suffix) < 4 {
		return p
	}
	for j := 0; j < len(suffix); j++ {
		if suffix[j] < '0' || suffix[j] > '9' {
			return p
		}
	}
	return p[:i]
}
|
|
|
|
// NumberChunks assigns "<parent>#NNNN" storage paths to a slice of chunk
|
|
// bodies, indexed from 0001. Empty chunks are dropped.
|
|
func NumberChunks(parent string, chunks []string) []NumberedChunk {
|
|
out := make([]NumberedChunk, 0, len(chunks))
|
|
idx := 1
|
|
for _, c := range chunks {
|
|
if strings.TrimSpace(c) == "" {
|
|
continue
|
|
}
|
|
out = append(out, NumberedChunk{
|
|
Path: fmt.Sprintf("%s#%04d", parent, idx),
|
|
Content: c,
|
|
})
|
|
idx++
|
|
}
|
|
return out
|
|
}
|
|
|
|
// ChunkMarkdown splits a markdown document into embedding-sized pieces.
|
|
// Strategy:
|
|
// 1. Split at H1/H2 headings (top-of-line "#" or "##"). The intro before
|
|
// the first heading is its own chunk.
|
|
// 2. Any section larger than maxBytes is further split at paragraph
|
|
// boundaries (blank lines), packing paragraphs greedily under the
|
|
// byte budget.
|
|
//
|
|
// The function aims for "fits comfortably under nomic-embed-text's 2048-
|
|
// token context" — at ~4 chars/token for English markdown, maxBytes ≈ 4000
|
|
// is a safe call-site default.
|
|
func ChunkMarkdown(content string, maxBytes int) []string {
|
|
if maxBytes <= 0 {
|
|
maxBytes = 4000
|
|
}
|
|
sections := splitAtHeadings(content)
|
|
|
|
out := make([]string, 0, len(sections))
|
|
for _, s := range sections {
|
|
if len(s) <= maxBytes {
|
|
out = append(out, s)
|
|
continue
|
|
}
|
|
out = append(out, splitAtParagraphs(s, maxBytes)...)
|
|
}
|
|
return out
|
|
}
|
|
|
|
// splitAtHeadings cuts content into sections that each start at an H1/H2
// heading line ("# " or "## ", optionally preceded by spaces). Text before
// the first heading becomes the leading section.
//
// Lines inside fenced code blocks (opened by a run of three or more
// backticks or tildes) are never treated as headings, so a comment such as
// "# install deps" in a shell snippet does not cut its section in half.
// Fence handling is a simplified reading of CommonMark: a fence closes on a
// line holding only a run of the same character at least as long as the
// opener, and an unclosed fence extends to the end of the content.
//
// Each returned section has its trailing newlines collapsed to exactly one
// "\n". A whitespace-only final section is dropped.
func splitAtHeadings(content string) []string {
	var (
		sections []string
		cur      strings.Builder
		fence    string // opening fence run while inside a fenced block, else ""
	)

	// flush emits the accumulated section, if any. Only trailing newlines
	// are trimmed (other trailing whitespace is kept) and a single "\n" is
	// re-added, so a single-paragraph file round-trips to its original
	// content instead of gaining newlines from the line split.
	flush := func() {
		if cur.Len() == 0 {
			return
		}
		sections = append(sections, strings.TrimRight(cur.String(), "\n")+"\n")
		cur.Reset()
	}

	// fenceRun returns the leading run of backticks or tildes in s when it
	// is at least three characters long, and "" otherwise.
	fenceRun := func(s string) string {
		if s == "" || (s[0] != '`' && s[0] != '~') {
			return ""
		}
		n := 1
		for n < len(s) && s[n] == s[0] {
			n++
		}
		if n < 3 {
			return ""
		}
		return s[:n]
	}

	for _, ln := range strings.Split(content, "\n") {
		trimmed := strings.TrimLeft(ln, " ")
		run := fenceRun(trimmed)
		switch {
		case fence != "":
			// Inside a fence only a matching closing fence matters.
			if run != "" && run[0] == fence[0] && len(run) >= len(fence) &&
				strings.TrimSpace(trimmed[len(run):]) == "" {
				fence = ""
			}
		case run != "":
			// A backtick run followed by more backticks on the same line is
			// inline code, not a fence opener.
			if run[0] == '~' || !strings.Contains(trimmed[len(run):], "`") {
				fence = run
			}
		case strings.HasPrefix(trimmed, "# ") || strings.HasPrefix(trimmed, "## "):
			// A heading starts a new section; flush is a no-op when nothing
			// has been accumulated yet.
			flush()
		}
		cur.WriteString(ln)
		cur.WriteByte('\n')
	}
	flush()

	// Drop an empty / whitespace-only trailing section.
	if n := len(sections); n > 0 && strings.TrimSpace(sections[n-1]) == "" {
		sections = sections[:n-1]
	}
	return sections
}
|
|
|
|
// splitAtParagraphs greedily packs the blank-line-separated paragraphs of
// section into sub-chunks no larger than maxBytes, rejoining packed
// paragraphs with "\n\n". Empty paragraphs are skipped.
//
// A lone paragraph that is itself larger than maxBytes is emitted whole as
// an over-budget chunk: overspending a little is preferred to cutting
// prose mid-sentence.
func splitAtParagraphs(section string, maxBytes int) []string {
	const sep = "\n\n"

	var (
		chunks []string
		packed []string // paragraphs gathered for the chunk being built
		size   int      // byte length of packed once joined with sep
	)
	emit := func() {
		if len(packed) == 0 {
			return
		}
		chunks = append(chunks, strings.Join(packed, sep))
		packed = packed[:0]
		size = 0
	}

	for _, para := range strings.Split(section, sep) {
		if len(para) == 0 {
			continue
		}
		// Close the current chunk when appending this paragraph (plus the
		// separator) would push it past the budget.
		if size > 0 && size+len(sep)+len(para) > maxBytes {
			emit()
		}
		if size > 0 {
			size += len(sep)
		}
		size += len(para)
		packed = append(packed, para)
	}
	emit()
	return chunks
}
|