From 9b11719481dfd60ab7d5f724b4f2bfb4acec5ea8 Mon Sep 17 00:00:00 2001
From: Mathias Bergqvist <mthbqv@gmail.com>
Date: Wed, 22 Apr 2026 22:37:14 +0200
Subject: [PATCH] feat(ingestion): add content chunking and LLM JSON output
 parser

---
 ingestion/internal/pipeline/chunk.go      | 39 ++++++++++++++++
 ingestion/internal/pipeline/chunk_test.go | 36 +++++++++++++++
 ingestion/internal/pipeline/parse.go      | 55 +++++++++++++++++++++++
 ingestion/internal/pipeline/parse_test.go | 46 +++++++++++++++++++
 4 files changed, 176 insertions(+)
 create mode 100644 ingestion/internal/pipeline/chunk.go
 create mode 100644 ingestion/internal/pipeline/chunk_test.go
 create mode 100644 ingestion/internal/pipeline/parse.go
 create mode 100644 ingestion/internal/pipeline/parse_test.go

diff --git a/ingestion/internal/pipeline/chunk.go b/ingestion/internal/pipeline/chunk.go
new file mode 100644
index 0000000..d4145cc
--- /dev/null
+++ b/ingestion/internal/pipeline/chunk.go
@@ -0,0 +1,39 @@
+// ingestion/internal/pipeline/chunk.go
+package pipeline
+
+import "strings"
+
+// Chunk trims content and packs its paragraphs (separated by "\n\n") into
+// chunks of up to maxSize bytes. Paragraphs are never split, so one longer
+// than maxSize becomes an oversized chunk of its own. If maxSize <= 0 or the
+// trimmed content already fits, that content is returned as the only chunk.
+func Chunk(content string, maxSize int) []string {
+	trimmed := strings.TrimSpace(content)
+	if maxSize <= 0 || len(trimmed) <= maxSize {
+		return []string{trimmed}
+	}
+
+	const sep = "\n\n"
+	var out []string
+	var buf strings.Builder
+	for _, raw := range strings.Split(trimmed, sep) {
+		p := strings.TrimSpace(raw)
+		switch {
+		case p == "":
+			// Paragraphs that are empty after trimming are dropped.
+		case buf.Len() == 0:
+			buf.WriteString(p)
+		case buf.Len()+len(sep)+len(p) > maxSize:
+			// No room left: emit the current chunk and start the next with p.
+			out = append(out, buf.String())
+			buf.Reset()
+			buf.WriteString(p)
+		default:
+			buf.WriteString(sep + p)
+		}
+	}
+	if buf.Len() > 0 {
+		out = append(out, buf.String())
+	}
+	return out
+}
diff --git a/ingestion/internal/pipeline/chunk_test.go b/ingestion/internal/pipeline/chunk_test.go
new file mode 100644
index 0000000..384efee
--- /dev/null
+++ b/ingestion/internal/pipeline/chunk_test.go
@@ -0,0 +1,36 @@
+// ingestion/internal/pipeline/chunk_test.go
+package pipeline
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestChunk_NoChunkingWhenZero(t *testing.T) {
+	// maxSize 0 disables chunking, so even long input comes back as one piece.
+	got := Chunk(strings.Repeat("word ", 1000), 0)
+	assert.Len(t, got, 1)
+}
+
+func TestChunk_SplitsAtParagraph(t *testing.T) {
+	// 45 bytes of input against a 40-byte limit forces a split at the blank line.
+	got := Chunk("First paragraph here.\n\nSecond paragraph here.", 40)
+	assert.Len(t, got, 2)
+	assert.Equal(t, "First paragraph here.", got[0])
+	assert.Equal(t, "Second paragraph here.", got[1])
+}
+
+func TestChunk_SingleLargeParagraph(t *testing.T) {
+	// A paragraph is never split, even when it alone exceeds maxSize.
+	got := Chunk(strings.Repeat("x", 100), 50)
+	assert.Len(t, got, 1)
+}
+
+func TestChunk_NoChunkingWhenContentFits(t *testing.T) {
+	// Content below the limit is returned untouched as the only chunk.
+	got := Chunk("Short content.", 1000)
+	assert.Len(t, got, 1)
+	assert.Equal(t, "Short content.", got[0])
+}
diff --git a/ingestion/internal/pipeline/parse.go b/ingestion/internal/pipeline/parse.go
new file mode 100644
index 0000000..ac9a38d
--- /dev/null
+++ b/ingestion/internal/pipeline/parse.go
@@ -0,0 +1,55 @@
+// ingestion/internal/pipeline/parse.go
+package pipeline
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
+)
+
+// ParsePages parses LLM output as a JSON array of {path, content} objects and
+// returns the pages plus warnings (nil only on a clean parse). If the array is
+// truncated mid-object (token limit), it salvages all complete objects.
+func ParsePages(output string) ([]wiki.Page, []string) {
+	output = strings.TrimSpace(output)
+	if output == "" {
+		return nil, []string{"LLM returned empty output"}
+	}
+	output = stripFences(output)
+
+	var pages []wiki.Page
+	if err := json.Unmarshal([]byte(output), &pages); err == nil {
+		return pages, nil
+	}
+
+	// Truncation recovery: retry with the array closed after each `}`, from the
+	// last one backwards, so a `}` inside the cut-off string is skipped.
+	start, end := strings.Index(output, "["), strings.LastIndex(output, "}")
+	switch {
+	case end < 0 || end < start: // no `}` after the `[`; slicing would panic
+		return nil, []string{"LLM output contained no complete JSON objects"}
+	case start < 0:
+		return nil, []string{"LLM output contained no JSON array opening bracket"}
+	}
+	var err error
+	for ; end > start; end = strings.LastIndex(output[:end], "}") {
+		pages = nil // drop anything a failed attempt may have decoded
+		if err = json.Unmarshal([]byte(output[start:end+1]+"]"), &pages); err == nil {
+			return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
+		}
+	}
+	return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
+}
+
+// stripFences drops a leading ``` or ```json fence line and a trailing ```, if present.
+func stripFences(s string) string {
+	for _, open := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} {
+		if strings.HasPrefix(s, open) {
+			body := strings.TrimSpace(s[len(open):])
+			return strings.TrimSpace(strings.TrimSuffix(body, "```"))
+		}
+	}
+	return s
+}
diff --git a/ingestion/internal/pipeline/parse_test.go b/ingestion/internal/pipeline/parse_test.go
new file mode 100644
index 0000000..e84464d
--- /dev/null
+++ b/ingestion/internal/pipeline/parse_test.go
@@ -0,0 +1,46 @@
+// ingestion/internal/pipeline/parse_test.go
+package pipeline
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestParsePages_ValidJSON(t *testing.T) {
+	// A well-formed two-element array parses cleanly, in order, with no warnings.
+	got, warns := ParsePages(`[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"# Bar"}]`)
+	require.Len(t, got, 2)
+	assert.Empty(t, warns)
+	assert.Equal(t, "wiki/sources/foo.md", got[0].Path)
+	assert.Equal(t, "wiki/concepts/bar.md", got[1].Path)
+}
+
+func TestParsePages_StripsFences(t *testing.T) {
+	// A ```json fence around the array is removed before parsing.
+	got, warns := ParsePages("```json\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"# Foo\"}]\n```")
+	assert.Len(t, got, 1)
+	assert.Empty(t, warns)
+}
+
+func TestParsePages_TruncationRecovery(t *testing.T) {
+	// The second object is cut off mid-string; only the first one survives.
+	got, warns := ParsePages(`[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"trunc`)
+	require.Len(t, got, 1)
+	assert.Equal(t, "wiki/sources/foo.md", got[0].Path)
+	assert.NotEmpty(t, warns)
+}
+
+func TestParsePages_EmptyInput(t *testing.T) {
+	got, warns := ParsePages("") // empty output yields no pages, only a warning
+	assert.Empty(t, got)
+	assert.NotEmpty(t, warns)
+}
+
+func TestParsePages_PlainFence(t *testing.T) {
+	// A bare ``` fence (no language tag) is stripped the same way.
+	got, warns := ParsePages("```\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"ok\"}]\n```")
+	assert.Len(t, got, 1)
+	assert.Empty(t, warns)
+}