package vectorstore_test

import (
	"strings"
	"testing"

	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// TestChunkMarkdown_ShortFileFitsInOne checks that an input far below the
// size limit is returned as exactly one chunk equal to the input.
func TestChunkMarkdown_ShortFileFitsInOne(t *testing.T) {
	const src = "Just a short paragraph.\n"

	out := vectorstore.ChunkMarkdown(src, 4000)

	require.Len(t, out, 1)
	assert.Equal(t, src, out[0])
}

// TestChunkMarkdown_SplitsAtHeadings checks that a document with one H1 and
// two H2 sections yields at least two chunks, and that every chunk after the
// first begins with a heading marker.
func TestChunkMarkdown_SplitsAtHeadings(t *testing.T) {
	src := "# Top\n\nintro\n\n## A\n\nbody a\n\n## B\n\nbody b\n"

	// Tiny limit intended to force a per-section split.
	// NOTE(review): src is only 41 bytes, i.e. already below this limit of 50.
	// Confirm ChunkMarkdown splits at headings regardless of total size;
	// otherwise the limit should be lowered for this test to be meaningful.
	out := vectorstore.ChunkMarkdown(src, 50)

	assert.GreaterOrEqual(t, len(out), 2, "should split at H2 boundaries")

	// Each chunk should start with a heading; the leading intro chunk is
	// allowed to go without one, so index 0 is skipped.
	for i, c := range out {
		if i == 0 {
			continue
		}
		assert.True(t, strings.HasPrefix(strings.TrimSpace(c), "#"),
			"non-first chunk %d should start with heading: %q", i, c)
	}
}

// TestChunkMarkdown_FurtherSplitsOversizedSection checks that a single H2
// section larger than the limit is broken into more than one chunk, and that
// no resulting chunk is longer than twice the limit.
func TestChunkMarkdown_FurtherSplitsOversizedSection(t *testing.T) {
	const maxBytes = 100

	// One H2 section holding four paragraphs of roughly 35 bytes each
	// (154 bytes in total), which exceeds maxBytes.
	src := "## big\n\n" +
		"paragraph one is moderately long.\n\n" +
		"paragraph two also moderately long.\n\n" +
		"paragraph three is moderately long.\n\n" +
		"paragraph four is moderately long.\n\n"

	out := vectorstore.ChunkMarkdown(src, maxBytes)

	assert.Greater(t, len(out), 1, "oversized section should sub-split at paragraph boundaries")
	for i, c := range out {
		assert.LessOrEqual(t, len(c), 2*maxBytes, "chunk %d exceeds 2x maxBytes: %d", i, len(c))
	}
}

// TestChunkMarkdown_PreservesContent checks that chunking does not drop
// content, by looking for one distinctive token from every heading and every
// section body in the concatenated chunks.
func TestChunkMarkdown_PreservesContent(t *testing.T) {
	src := "# H1\n\nfirst section body.\n\n## H2a\n\nsecond section body.\n\n## H2b\n\nthird section body.\n"

	out := vectorstore.ChunkMarkdown(src, 50)
	joined := strings.Join(out, "")

	// Spot check: a representative token per heading and per body, not an
	// exhaustive comparison of every token in src.
	for _, token := range []string{"H1", "first", "H2a", "second", "H2b", "third"} {
		assert.Contains(t, joined, token, "token %q missing after chunking", token)
	}
}

// TestChunkMarkdown_NumberedSuffix exercises NumberChunks (despite the
// ChunkMarkdown prefix in its name): each chunk is expected to get the parent
// path followed by "#" and a 1-based, zero-padded four-digit index, and the
// first chunk is expected to keep its content.
func TestChunkMarkdown_NumberedSuffix(t *testing.T) {
	out := vectorstore.NumberChunks("knowledge/foo.md", []string{"a", "b", "c"})

	require.Len(t, out, 3)
	assert.Equal(t, "knowledge/foo.md#0001", out[0].Path)
	assert.Equal(t, "knowledge/foo.md#0002", out[1].Path)
	assert.Equal(t, "knowledge/foo.md#0003", out[2].Path)
	assert.Equal(t, "a", out[0].Content)
}

// TestParentPath_StripsChunkSuffix checks that ParentPath removes a trailing
// "#NNNN" chunk suffix and returns a path without such a suffix unchanged.
func TestParentPath_StripsChunkSuffix(t *testing.T) {
	assert.Equal(t, "knowledge/foo.md", vectorstore.ParentPath("knowledge/foo.md#0001"))
	assert.Equal(t, "knowledge/foo.md", vectorstore.ParentPath("knowledge/foo.md"))
	assert.Equal(t, "wiki/a/b.md", vectorstore.ParentPath("wiki/a/b.md#9999"))
}