// hyperguild/ingestion/internal/vectorstore/chunk.go

package vectorstore

import (
	"fmt"
	"strings"
)

// NumberedChunk is one chunk of a document together with the storage path
// under which it is kept in brain_embeddings.
type NumberedChunk struct {
	// Path is the storage path, formatted as "<parent>#NNNN" where NNNN is
	// the 1-based chunk index, zero-padded to 4 digits.
	Path string
	// Content is the chunk's body text.
	Content string
}

// ParentPath returns the file path with a trailing "#NNNN" chunk suffix
// removed. Used by search to dedupe chunk-level hits back to a single
// document per result.
//
// Only a suffix of the shape written by NumberChunks is stripped: the LAST
// "#" in p followed by nothing but ASCII digits, at least four of them
// ("%04d" pads to four and grows beyond that for indexes above 9999).
// Anything else is returned unchanged, so a parent path that itself
// contains "#" (for example "docs/c#/intro.md") survives intact both with
// and without a chunk suffix.
func ParentPath(p string) string {
	i := strings.LastIndexByte(p, '#')
	if i < 0 {
		return p
	}
	suffix := p[i+1:]
	if len(suffix) < 4 {
		return p
	}
	for j := 0; j < len(suffix); j++ {
		if suffix[j] < '0' || suffix[j] > '9' {
			// Not a chunk index, so the "#" belongs to the path itself.
			return p
		}
	}
	return p[:i]
}

// NumberChunks turns raw chunk bodies into NumberedChunk values whose Path
// is "<parent>#NNNN". Numbering starts at 0001 and is contiguous over the
// chunks that are kept: a chunk that is empty or whitespace-only is skipped
// and does not consume an index. Content is stored exactly as given (it is
// not trimmed). The result is always non-nil, even when no chunk survives.
func NumberChunks(parent string, chunks []string) []NumberedChunk {
	numbered := make([]NumberedChunk, 0, len(chunks))
	for _, body := range chunks {
		if len(strings.TrimSpace(body)) == 0 {
			continue
		}
		// The next 1-based index is one past the number already kept.
		seq := len(numbered) + 1
		numbered = append(numbered, NumberedChunk{
			Path:    fmt.Sprintf("%s#%04d", parent, seq),
			Content: body,
		})
	}
	return numbered
}

// ChunkMarkdown breaks a markdown document into pieces small enough to
// embed. It works in two passes:
//
//  1. The document is cut at H1/H2 headings (lines starting with "# " or
//     "## "); any text before the first heading forms its own section.
//  2. A section longer than maxBytes is cut again at blank-line paragraph
//     boundaries, with paragraphs packed greedily up to the byte budget.
//     A single paragraph longer than maxBytes is kept whole.
//
// maxBytes is measured in bytes, not tokens. A value <= 0 selects the
// default of 4000, chosen to sit comfortably under nomic-embed-text's
// 2048-token context at roughly 4 characters per token for English
// markdown.
func ChunkMarkdown(content string, maxBytes int) []string {
	const defaultMaxBytes = 4000

	budget := maxBytes
	if budget <= 0 {
		budget = defaultMaxBytes
	}

	sections := splitAtHeadings(content)
	chunks := make([]string, 0, len(sections))
	for _, section := range sections {
		if len(section) > budget {
			// Too large for one embedding call: fall back to paragraphs.
			chunks = append(chunks, splitAtParagraphs(section, budget)...)
		} else {
			chunks = append(chunks, section)
		}
	}
	return chunks
}

// splitAtHeadings cuts content into sections that each start with an
// "# " or "## " line (intro before any heading is the leading section).
//
// Lines inside a fenced code block (opened by a line of three or more
// backticks or tildes, indented at most three spaces) are never treated as
// headings, so a comment such as "# install deps" inside a shell snippet
// does not cut the snippet in half. An unterminated fence runs to the end
// of content.
//
// Each returned section has its trailing newlines collapsed to exactly one
// "\n". Whitespace-only sections (for example blank lines before the first
// heading, or an empty document) are dropped; the result is nil when
// nothing remains.
func splitAtHeadings(content string) []string {
	var sections []string
	var cur strings.Builder

	// flush emits the accumulated section, if it has any visible content,
	// and starts a new one.
	flush := func() {
		if cur.Len() == 0 {
			return
		}
		// Trim trailing newlines then re-add a single one so a
		// single-paragraph file round-trips to its original content rather
		// than accumulating extra newlines from the empty-line split.
		s := strings.TrimRight(cur.String(), "\n")
		cur.Reset()
		if strings.TrimSpace(s) == "" {
			return
		}
		sections = append(sections, s+"\n")
	}

	// fence inspects ln for a code-fence run. It returns the fence
	// character, the length of the run (0 when ln is not a fence line) and
	// whatever follows the run on the same line.
	fence := func(ln string) (marker byte, run int, rest string) {
		t := strings.TrimLeft(ln, " ")
		if len(ln)-len(t) > 3 || t == "" || (t[0] != '`' && t[0] != '~') {
			return 0, 0, ""
		}
		marker = t[0]
		for run < len(t) && t[run] == marker {
			run++
		}
		if run < 3 {
			return 0, 0, ""
		}
		return marker, run, t[run:]
	}

	var (
		openMarker byte // fence character of the currently open block
		openRun    int  // its run length; 0 means "not inside a fence"
	)
	for _, ln := range strings.Split(content, "\n") {
		marker, run, rest := fence(ln)
		switch {
		case openRun > 0:
			// Inside a fence: only a closing fence of the same character,
			// at least as long as the opener and with nothing after it,
			// ends the block. Headings are ignored here.
			if marker == openMarker && run >= openRun && strings.TrimSpace(rest) == "" {
				openRun = 0
			}
		case run > 0 && (marker != '`' || strings.IndexByte(rest, '`') < 0):
			// Opening fence. A backtick run followed by more backticks on
			// the same line is inline code, not a fence.
			openMarker, openRun = marker, run
		default:
			trimmed := strings.TrimLeft(ln, " ")
			if strings.HasPrefix(trimmed, "# ") || strings.HasPrefix(trimmed, "## ") {
				flush()
			}
		}
		cur.WriteString(ln)
		cur.WriteByte('\n')
	}
	flush()
	return sections
}

// splitAtParagraphs divides section into sub-chunks of at most maxBytes by
// greedily packing paragraphs, where a paragraph is a block delimited by
// "\n\n". Paragraphs sharing a sub-chunk are rejoined with "\n\n", and that
// separator counts toward the budget. Empty paragraphs (produced by runs of
// blank lines) are skipped.
//
// A paragraph that is larger than maxBytes on its own is emitted as a
// single over-budget sub-chunk instead of being cut mid-sentence: slightly
// overspending is preferred to truncating prose.
func splitAtParagraphs(section string, maxBytes int) []string {
	const sep = "\n\n"

	var (
		chunks []string
		buf    strings.Builder
	)
	for _, para := range strings.Split(section, sep) {
		switch {
		case len(para) == 0:
			// Nothing to pack.
		case buf.Len() == 0:
			// First paragraph of a sub-chunk always goes in, whatever its
			// size.
			buf.WriteString(para)
		case buf.Len()+len(sep)+len(para) > maxBytes:
			// Appending would bust the budget: close the current sub-chunk
			// and start the next one with this paragraph.
			chunks = append(chunks, buf.String())
			buf.Reset()
			buf.WriteString(para)
		default:
			buf.WriteString(sep)
			buf.WriteString(para)
		}
	}
	if buf.Len() > 0 {
		chunks = append(chunks, buf.String())
	}
	return chunks
}