feat(ingestion): add content chunking and LLM JSON output parser

2026-04-22 22:37:14 +02:00
parent d405346f07
commit 9b11719481
4 changed files with 176 additions and 0 deletions
--- a/ingestion/internal/pipeline/chunk.go
+++ b/ingestion/internal/pipeline/chunk.go
@@ -0,0 +1,39 @@
+// ingestion/internal/pipeline/chunk.go
+package pipeline
+
+import "strings"
+
+// Chunk breaks content into chunks by greedily packing whole paragraphs
+// (separated by "\n\n") until adding the next paragraph would push the
+// current chunk past maxSize bytes.
+//
+// Surrounding whitespace is trimmed from content first. When maxSize <= 0, or
+// the trimmed content already fits in maxSize bytes, the trimmed content is
+// returned unchanged as a single chunk. Otherwise each paragraph is trimmed,
+// whitespace-only paragraphs are dropped, and the paragraphs that share a
+// chunk are rejoined with "\n\n".
+//
+// Paragraphs are never split: a paragraph longer than maxSize becomes a chunk
+// of its own, and that chunk exceeds maxSize.
+func Chunk(content string, maxSize int) []string {
+	trimmed := strings.TrimSpace(content)
+	if maxSize <= 0 || len(trimmed) <= maxSize {
+		return []string{trimmed}
+	}
+
+	const sep = "\n\n"
+	var (
+		out     []string
+		pending strings.Builder
+	)
+	for _, raw := range strings.Split(trimmed, sep) {
+		p := strings.TrimSpace(raw)
+		switch {
+		case p == "":
+			// Skip whitespace-only paragraphs.
+		case pending.Len() == 0:
+			// First paragraph of a chunk is always accepted, whatever its size.
+			pending.WriteString(p)
+		case pending.Len()+len(sep)+len(p) > maxSize:
+			// p does not fit: close the current chunk and start the next with p.
+			out = append(out, pending.String())
+			pending.Reset()
+			pending.WriteString(p)
+		default:
+			pending.WriteString(sep)
+			pending.WriteString(p)
+		}
+	}
+	if pending.Len() > 0 {
+		out = append(out, pending.String())
+	}
+	return out
+}
--- a/ingestion/internal/pipeline/chunk_test.go
+++ b/ingestion/internal/pipeline/chunk_test.go
@@ -0,0 +1,36 @@
+// ingestion/internal/pipeline/chunk_test.go
+package pipeline
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestChunk_NoChunkingWhenZero verifies that a maxSize of 0 disables
+// chunking, so even long content comes back as a single chunk.
+func TestChunk_NoChunkingWhenZero(t *testing.T) {
+	got := Chunk(strings.Repeat("word ", 1000), 0)
+
+	assert.Len(t, got, 1)
+}
+
+// TestChunk_SplitsAtParagraph checks that content larger than maxSize is
+// split at the "\n\n" boundary and that the separator is kept in neither
+// chunk.
+func TestChunk_SplitsAtParagraph(t *testing.T) {
+	content := "First paragraph here.\n\nSecond paragraph here."
+	chunks := Chunk(content, 40)
+	// assert.Len only records a failure; stop here so the index expressions
+	// below cannot panic on a slice that is too short.
+	if !assert.Len(t, chunks, 2) {
+		return
+	}
+	assert.Equal(t, "First paragraph here.", chunks[0])
+	assert.Equal(t, "Second paragraph here.", chunks[1])
+}
+
+// TestChunk_SingleLargeParagraph verifies that a lone paragraph longer than
+// maxSize is not split and is returned as exactly one chunk.
+func TestChunk_SingleLargeParagraph(t *testing.T) {
+	oversized := strings.Repeat("x", 100)
+
+	got := Chunk(oversized, 50)
+
+	assert.Len(t, got, 1)
+}
+
+// TestChunk_NoChunkingWhenContentFits verifies that content shorter than
+// maxSize is returned unchanged as a single chunk.
+func TestChunk_NoChunkingWhenContentFits(t *testing.T) {
+	content := "Short content."
+	chunks := Chunk(content, 1000)
+	// assert.Len only records a failure; stop here so chunks[0] cannot panic
+	// on an empty slice.
+	if !assert.Len(t, chunks, 1) {
+		return
+	}
+	assert.Equal(t, "Short content.", chunks[0])
+}
--- a/ingestion/internal/pipeline/parse.go
+++ b/ingestion/internal/pipeline/parse.go
@@ -0,0 +1,55 @@
+// ingestion/internal/pipeline/parse.go
+package pipeline
+
+import (
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
+)
+
+// ParsePages parses LLM output as a JSON array of {path, content} objects.
+//
+// The output is trimmed and passed through stripFences before decoding. If it
+// is then a valid JSON array, the decoded pages are returned with no warnings.
+//
+// Otherwise the array is assumed to be truncated (e.g. by a token limit) and
+// is decoded element by element, starting at the first '['. Every element
+// that decodes before the first error is kept, and a single warning reports
+// how many pages were recovered.
+//
+// The second return value holds warnings. When no page could be parsed the
+// pages are nil and the warning explains why.
+func ParsePages(output string) ([]wiki.Page, []string) {
+	output = strings.TrimSpace(output)
+	if output == "" {
+		return nil, []string{"LLM returned empty output"}
+	}
+
+	output = stripFences(output)
+
+	var pages []wiki.Page
+	if err := json.Unmarshal([]byte(output), &pages); err == nil {
+		return pages, nil
+	}
+
+	// Truncation recovery. The cheap structural checks run first so that
+	// their specific messages are reported.
+	if !strings.Contains(output, "}") {
+		return nil, []string{"LLM output contained no complete JSON objects"}
+	}
+	start := strings.Index(output, "[")
+	if start < 0 {
+		return nil, []string{"LLM output contained no JSON array opening bracket"}
+	}
+
+	// Decode incrementally instead of slicing the text at the last '}': a '}'
+	// inside a cut-off string, or one that appears before the first '[', must
+	// not break recovery.
+	dec := json.NewDecoder(strings.NewReader(output[start:]))
+	if _, err := dec.Token(); err != nil { // consume the opening '['
+		return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
+	}
+
+	// Collect into a fresh slice: the failed Unmarshal above may have left
+	// pages partially populated.
+	var (
+		recovered []wiki.Page
+		decodeErr error
+	)
+	for dec.More() {
+		var page wiki.Page
+		if decodeErr = dec.Decode(&page); decodeErr != nil {
+			break
+		}
+		recovered = append(recovered, page)
+	}
+
+	switch {
+	case len(recovered) > 0:
+		return recovered, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(recovered))}
+	case decodeErr != nil:
+		return nil, []string{fmt.Sprintf("truncation recovery failed: %v", decodeErr)}
+	default:
+		return nil, []string{"LLM output contained no complete JSON objects"}
+	}
+}
+
+// stripFences removes a Markdown code fence wrapped around s.
+//
+// A fence is recognised only when s starts with "```json" or "```" followed
+// immediately by a line break (LF or CRLF). In that case the opening line is
+// dropped, surrounding whitespace is trimmed, a trailing "```" is removed if
+// present, and the result is trimmed again. Any other input is returned
+// unchanged.
+func stripFences(s string) string {
+	openers := [...]string{"```json\n", "```json\r\n", "```\n", "```\r\n"}
+	for i := range openers {
+		if !strings.HasPrefix(s, openers[i]) {
+			continue
+		}
+		body := strings.TrimSpace(s[len(openers[i]):])
+		body = strings.TrimSuffix(body, "```")
+		return strings.TrimSpace(body)
+	}
+	return s
+}
--- a/ingestion/internal/pipeline/parse_test.go
+++ b/ingestion/internal/pipeline/parse_test.go
@@ -0,0 +1,46 @@
+// ingestion/internal/pipeline/parse_test.go
+package pipeline
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestParsePages_ValidJSON verifies that a well-formed JSON array yields all
+// of its pages, in order, with no warnings.
+func TestParsePages_ValidJSON(t *testing.T) {
+	const raw = `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"# Bar"}]`
+
+	got, warns := ParsePages(raw)
+
+	// Stop immediately on a length mismatch so the indexing below is safe.
+	require.Len(t, got, 2)
+	assert.Empty(t, warns)
+	assert.Equal(t, "wiki/sources/foo.md", got[0].Path)
+	assert.Equal(t, "wiki/concepts/bar.md", got[1].Path)
+}
+
+// TestParsePages_StripsFences verifies that output wrapped in a "```json"
+// fence parses cleanly and produces no warnings.
+func TestParsePages_StripsFences(t *testing.T) {
+	const fenced = "```json\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"# Foo\"}]\n```"
+
+	got, warns := ParsePages(fenced)
+
+	assert.Len(t, got, 1)
+	assert.Empty(t, warns)
+}
+
+// TestParsePages_TruncationRecovery verifies that when the output is cut off
+// part-way through an object, the complete objects before it are still
+// returned and a warning is emitted.
+func TestParsePages_TruncationRecovery(t *testing.T) {
+	// The second object ends abruptly inside its "content" string.
+	const truncated = `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"trunc`
+
+	got, warns := ParsePages(truncated)
+
+	require.Len(t, got, 1)
+	assert.Equal(t, "wiki/sources/foo.md", got[0].Path)
+	assert.NotEmpty(t, warns)
+}
+
+// TestParsePages_EmptyInput verifies that empty output produces no pages and
+// at least one warning.
+func TestParsePages_EmptyInput(t *testing.T) {
+	got, warns := ParsePages("")
+
+	assert.Empty(t, got)
+	assert.NotEmpty(t, warns)
+}
+
+// TestParsePages_PlainFence verifies that a fence without a language tag
+// ("```") is stripped just like a "```json" fence.
+func TestParsePages_PlainFence(t *testing.T) {
+	const fenced = "```\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"ok\"}]\n```"
+
+	got, warns := ParsePages(fenced)
+
+	assert.Len(t, got, 1)
+	assert.Empty(t, warns)
+}