From 9b11719481dfd60ab7d5f724b4f2bfb4acec5ea8 Mon Sep 17 00:00:00 2001
From: Mathias Bergqvist
Date: Wed, 22 Apr 2026 22:37:14 +0200
Subject: [PATCH] feat(ingestion): add content chunking and LLM JSON output parser

---
 ingestion/internal/pipeline/chunk.go      | 43 +++++++++++
 ingestion/internal/pipeline/chunk_test.go | 36 +++++++++
 ingestion/internal/pipeline/parse.go      | 94 +++++++++++++++++++++++
 ingestion/internal/pipeline/parse_test.go | 60 +++++++++++++++
 4 files changed, 233 insertions(+)
 create mode 100644 ingestion/internal/pipeline/chunk.go
 create mode 100644 ingestion/internal/pipeline/chunk_test.go
 create mode 100644 ingestion/internal/pipeline/parse.go
 create mode 100644 ingestion/internal/pipeline/parse_test.go

diff --git a/ingestion/internal/pipeline/chunk.go b/ingestion/internal/pipeline/chunk.go
new file mode 100644
index 0000000..d4145cc
--- /dev/null
+++ b/ingestion/internal/pipeline/chunk.go
@@ -0,0 +1,43 @@
+// ingestion/internal/pipeline/chunk.go
+package pipeline
+
+import "strings"
+
+// Chunk splits content at paragraph boundaries (\n\n) and packs consecutive
+// paragraphs into chunks of at most maxSize bytes. Paragraphs are never split:
+// a single paragraph longer than maxSize is returned as its own oversized
+// chunk. Content and paragraphs are whitespace-trimmed and empty paragraphs
+// are dropped. If maxSize <= 0, or the trimmed content already fits within
+// maxSize, the content is returned as one chunk.
+func Chunk(content string, maxSize int) []string {
+	content = strings.TrimSpace(content)
+	if maxSize <= 0 || len(content) <= maxSize {
+		return []string{content}
+	}
+
+	paragraphs := strings.Split(content, "\n\n")
+	var chunks []string
+	var cur strings.Builder
+
+	for _, para := range paragraphs {
+		para = strings.TrimSpace(para)
+		if para == "" {
+			continue
+		}
+		addition := para
+		if cur.Len() > 0 {
+			addition = "\n\n" + para
+		}
+		if cur.Len() > 0 && cur.Len()+len(addition) > maxSize {
+			chunks = append(chunks, cur.String())
+			cur.Reset()
+			cur.WriteString(para)
+		} else {
+			cur.WriteString(addition)
+		}
+	}
+	if cur.Len() > 0 {
+		chunks = append(chunks, cur.String())
+	}
+	return chunks
+}
diff --git a/ingestion/internal/pipeline/chunk_test.go b/ingestion/internal/pipeline/chunk_test.go
new file mode 100644
index 0000000..384efee
--- /dev/null
+++ b/ingestion/internal/pipeline/chunk_test.go
@@ -0,0 +1,36 @@
+// ingestion/internal/pipeline/chunk_test.go
+package pipeline
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestChunk_NoChunkingWhenZero(t *testing.T) {
+	content := strings.Repeat("word ", 1000)
+	chunks := Chunk(content, 0)
+	assert.Len(t, chunks, 1)
+}
+
+func TestChunk_SplitsAtParagraph(t *testing.T) {
+	content := "First paragraph here.\n\nSecond paragraph here."
+	chunks := Chunk(content, 40)
+	assert.Len(t, chunks, 2)
+	assert.Equal(t, "First paragraph here.", chunks[0])
+	assert.Equal(t, "Second paragraph here.", chunks[1])
+}
+
+func TestChunk_SingleLargeParagraph(t *testing.T) {
+	content := strings.Repeat("x", 100)
+	chunks := Chunk(content, 50)
+	assert.Len(t, chunks, 1)
+}
+
+func TestChunk_NoChunkingWhenContentFits(t *testing.T) {
+	content := "Short content."
+	chunks := Chunk(content, 1000)
+	assert.Len(t, chunks, 1)
+	assert.Equal(t, "Short content.", chunks[0])
+}
diff --git a/ingestion/internal/pipeline/parse.go b/ingestion/internal/pipeline/parse.go
new file mode 100644
index 0000000..ac9a38d
--- /dev/null
+++ b/ingestion/internal/pipeline/parse.go
@@ -0,0 +1,94 @@
+// ingestion/internal/pipeline/parse.go
+package pipeline
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
+)
+
+// ParsePages parses LLM output as a JSON array of {path, content} objects.
+// Surrounding whitespace and a wrapping Markdown code fence are removed first.
+//
+// It returns the decoded pages and a list of human-readable warnings. The
+// warnings are nil when the output parsed cleanly; when nothing could be
+// parsed, the pages are nil and the warnings explain why.
+//
+// If the output is not a valid array as a whole (typically because it was cut
+// off mid-object at the token limit), every object that decodes completely
+// before the first malformed element is salvaged, and a warning reports how
+// many pages were recovered.
+func ParsePages(output string) ([]wiki.Page, []string) {
+	output = strings.TrimSpace(output)
+	if output == "" {
+		return nil, []string{"LLM returned empty output"}
+	}
+
+	output = stripFences(output)
+
+	var pages []wiki.Page
+	if err := json.Unmarshal([]byte(output), &pages); err == nil {
+		return pages, nil
+	}
+
+	// Truncation recovery. A complete object needs a closing `}` somewhere
+	// after the array's opening `[`; without that there is nothing to salvage.
+	end := strings.LastIndex(output, "}")
+	if end < 0 {
+		return nil, []string{"LLM output contained no complete JSON objects"}
+	}
+
+	start := strings.Index(output, "[")
+	if start < 0 {
+		return nil, []string{"LLM output contained no JSON array opening bracket"}
+	}
+	if end < start {
+		return nil, []string{"LLM output contained no complete JSON objects"}
+	}
+
+	// Decode the array one element at a time rather than cutting at the last
+	// `}`: a brace inside the truncated trailing string (code samples,
+	// templates) would otherwise defeat the recovery.
+	dec := json.NewDecoder(strings.NewReader(output[start:]))
+	if _, err := dec.Token(); err != nil { // consume the opening `[`
+		return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
+	}
+
+	// Collect into a fresh slice: the failed Unmarshal above may have left
+	// partially decoded data in pages.
+	var recovered []wiki.Page
+	var decodeErr error
+	for dec.More() {
+		var page wiki.Page
+		if decodeErr = dec.Decode(&page); decodeErr != nil {
+			break
+		}
+		recovered = append(recovered, page)
+	}
+
+	if len(recovered) == 0 {
+		if decodeErr != nil {
+			return nil, []string{fmt.Sprintf("truncation recovery failed: %v", decodeErr)}
+		}
+		return nil, []string{"LLM output contained no complete JSON objects"}
+	}
+
+	return recovered, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(recovered))}
+}
+
+// stripFences removes a Markdown code fence wrapping s. Only an opening
+// "```json" or "```" line at the very start of s is recognized; when one is
+// found, a trailing "```" and surrounding whitespace are removed as well.
+// Any other input is returned unchanged.
+func stripFences(s string) string {
+	for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} {
+		if strings.HasPrefix(s, prefix) {
+			s = strings.TrimPrefix(s, prefix)
+			s = strings.TrimSuffix(strings.TrimSpace(s), "```")
+			return strings.TrimSpace(s)
+		}
+	}
+	return s
+}
diff --git a/ingestion/internal/pipeline/parse_test.go b/ingestion/internal/pipeline/parse_test.go
new file mode 100644
index 0000000..e84464d
--- /dev/null
+++ b/ingestion/internal/pipeline/parse_test.go
@@ -0,0 +1,60 @@
+// ingestion/internal/pipeline/parse_test.go
+package pipeline
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestParsePages_ValidJSON(t *testing.T) {
+	input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"# Bar"}]`
+	pages, warnings := ParsePages(input)
+	require.Len(t, pages, 2)
+	assert.Empty(t, warnings)
+	assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
+	assert.Equal(t, "wiki/concepts/bar.md", pages[1].Path)
+}
+
+func TestParsePages_StripsFences(t *testing.T) {
+	input := "```json\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"# Foo\"}]\n```"
+	pages, warnings := ParsePages(input)
+	assert.Len(t, pages, 1)
+	assert.Empty(t, warnings)
+}
+
+func TestParsePages_TruncationRecovery(t *testing.T) {
+	input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"trunc`
+	pages, warnings := ParsePages(input)
+	require.Len(t, pages, 1)
+	assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
+	assert.NotEmpty(t, warnings)
+}
+
+func TestParsePages_EmptyInput(t *testing.T) {
+	pages, warnings := ParsePages("")
+	assert.Empty(t, pages)
+	assert.NotEmpty(t, warnings)
+}
+
+func TestParsePages_PlainFence(t *testing.T) {
+	input := "```\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"ok\"}]\n```"
+	pages, warnings := ParsePages(input)
+	assert.Len(t, pages, 1)
+	assert.Empty(t, warnings)
+}
+
+func TestParsePages_TruncationWithBraceInPartialContent(t *testing.T) {
+	input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"func main() {}`
+	pages, warnings := ParsePages(input)
+	require.Len(t, pages, 1)
+	assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
+	assert.NotEmpty(t, warnings)
+}
+
+func TestParsePages_BraceBeforeBracket(t *testing.T) {
+	pages, warnings := ParsePages("} [")
+	assert.Empty(t, pages)
+	assert.NotEmpty(t, warnings)
+}