feat(ingestion): add content chunking and LLM JSON output parser

This commit is contained in:
Mathias Bergqvist
2026-04-22 22:37:14 +02:00
parent d405346f07
commit 9b11719481
4 changed files with 176 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
// ingestion/internal/pipeline/chunk.go
package pipeline
import "strings"
// Chunk trims content and groups its paragraphs (separated by a blank line,
// "\n\n") into chunks, packing as many whole paragraphs into each chunk as fit
// within maxSize bytes.
//
// Paragraphs are never split: a single paragraph longer than maxSize is
// emitted as its own chunk, so a chunk may exceed maxSize in that case.
// Paragraphs are individually trimmed, empty ones are dropped, and paragraphs
// sharing a chunk are re-joined with "\n\n".
//
// If maxSize <= 0, or the trimmed content already fits in maxSize bytes, the
// trimmed content is returned as a single chunk.
func Chunk(content string, maxSize int) []string {
	trimmed := strings.TrimSpace(content)
	if maxSize <= 0 || len(trimmed) <= maxSize {
		return []string{trimmed}
	}

	const sep = "\n\n"
	var (
		out   []string // finished chunks
		group []string // paragraphs collected for the chunk being built
		size  int      // byte length of group once joined with sep
	)
	for _, raw := range strings.Split(trimmed, sep) {
		para := strings.TrimSpace(raw)
		if para == "" {
			continue
		}
		switch {
		case len(group) == 0:
			// First paragraph of a chunk is always accepted, even if oversized.
			size = len(para)
		case size+len(sep)+len(para) > maxSize:
			// Adding this paragraph would overflow: close the current chunk
			// and start a new one with this paragraph.
			out = append(out, strings.Join(group, sep))
			group = group[:0]
			size = len(para)
		default:
			size += len(sep) + len(para)
		}
		group = append(group, para)
	}
	if len(group) > 0 {
		out = append(out, strings.Join(group, sep))
	}
	return out
}

View File

@@ -0,0 +1,36 @@
// ingestion/internal/pipeline/chunk_test.go
package pipeline
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
// TestChunk_NoChunkingWhenZero checks that a maxSize of 0 disables splitting:
// even a long input comes back as exactly one chunk.
func TestChunk_NoChunkingWhenZero(t *testing.T) {
	got := Chunk(strings.Repeat("word ", 1000), 0)

	assert.Len(t, got, 1)
}
// TestChunk_SplitsAtParagraph checks that two paragraphs which together exceed
// maxSize are returned as two separate chunks, each without the blank-line
// separator.
func TestChunk_SplitsAtParagraph(t *testing.T) {
	content := "First paragraph here.\n\nSecond paragraph here."
	chunks := Chunk(content, 40)

	// Compare the whole slice at once: indexing chunks[0]/chunks[1] after a
	// non-fatal length assertion would panic if Chunk returned too few chunks.
	want := []string{"First paragraph here.", "Second paragraph here."}
	assert.Equal(t, want, chunks)
}
// TestChunk_SingleLargeParagraph checks that a paragraph longer than maxSize
// is kept whole: content with no paragraph break yields a single chunk.
func TestChunk_SingleLargeParagraph(t *testing.T) {
	got := Chunk(strings.Repeat("x", 100), 50)

	assert.Len(t, got, 1)
}
// TestChunk_NoChunkingWhenContentFits checks that content shorter than maxSize
// is returned unchanged as the only chunk.
func TestChunk_NoChunkingWhenContentFits(t *testing.T) {
	content := "Short content."
	chunks := Chunk(content, 1000)

	// Compare the whole slice so an unexpected empty result fails the
	// assertion instead of panicking on chunks[0].
	assert.Equal(t, []string{"Short content."}, chunks)
}

View File

@@ -0,0 +1,55 @@
// ingestion/internal/pipeline/parse.go
package pipeline
import (
"encoding/json"
"fmt"
"strings"
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
)
// ParsePages parses LLM output as a JSON array of {path, content} objects.
//
// Surrounding whitespace and a wrapping Markdown code fence (see stripFences)
// are removed first. If the remaining text is a valid JSON array, the decoded
// pages are returned with no warnings.
//
// Otherwise the output is assumed to be truncated (e.g. by a token limit) and
// the array elements are decoded one at a time, starting at the first "[".
// Every element that decodes before the first error is kept. This salvages the
// leading complete objects even when the truncated tail contains a "}" inside
// an unfinished string.
//
// The second return value holds human-readable warnings: nil on a clean parse,
// a single "truncated; recovered N page(s)" note when recovery succeeded, or a
// single message describing why nothing could be parsed (in which case the
// returned pages are nil).
func ParsePages(output string) ([]wiki.Page, []string) {
	output = strings.TrimSpace(output)
	if output == "" {
		return nil, []string{"LLM returned empty output"}
	}
	output = stripFences(output)

	var pages []wiki.Page
	if err := json.Unmarshal([]byte(output), &pages); err == nil {
		return pages, nil
	}

	// Truncation recovery. Without any closing brace no object can be complete.
	if !strings.Contains(output, "}") {
		return nil, []string{"LLM output contained no complete JSON objects"}
	}
	start := strings.Index(output, "[")
	if start < 0 {
		return nil, []string{"LLM output contained no JSON array opening bracket"}
	}

	dec := json.NewDecoder(strings.NewReader(output[start:]))
	// Consume the opening "[" so the loop below sees individual elements.
	if _, err := dec.Token(); err != nil {
		return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
	}

	// Decode into fresh values so a failed element never leaks partial data
	// into the result.
	var (
		recovered []wiki.Page
		decodeErr error
	)
	for dec.More() {
		var page wiki.Page
		if decodeErr = dec.Decode(&page); decodeErr != nil {
			break
		}
		recovered = append(recovered, page)
	}

	if len(recovered) == 0 {
		if decodeErr != nil {
			return nil, []string{fmt.Sprintf("truncation recovery failed: %v", decodeErr)}
		}
		return nil, []string{"LLM output contained no complete JSON objects"}
	}
	return recovered, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(recovered))}
}
// stripFences removes a Markdown code fence wrapped around s.
//
// It only acts when s begins with an opening fence line: "```json" or a bare
// "```", terminated by LF or CRLF. In that case the opening line is dropped, a
// trailing "```" (if present) is removed, and the result is trimmed of
// surrounding whitespace. Any other input is returned unchanged.
func stripFences(s string) string {
	for _, opener := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} {
		if !strings.HasPrefix(s, opener) {
			continue
		}
		// Trim before looking for the closing fence so trailing newlines after
		// it do not hide it; a missing closing fence is tolerated.
		inner := strings.TrimSpace(s[len(opener):])
		return strings.TrimSpace(strings.TrimSuffix(inner, "```"))
	}
	return s
}

View File

@@ -0,0 +1,46 @@
// ingestion/internal/pipeline/parse_test.go
package pipeline
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestParsePages_ValidJSON checks that a well-formed JSON array decodes into
// one page per object, in order, without producing any warnings.
func TestParsePages_ValidJSON(t *testing.T) {
	const raw = `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"# Bar"}]`

	got, warns := ParsePages(raw)

	// require stops the test here so the index accesses below are safe.
	require.Len(t, got, 2)
	assert.Empty(t, warns)
	assert.Equal(t, "wiki/sources/foo.md", got[0].Path)
	assert.Equal(t, "wiki/concepts/bar.md", got[1].Path)
}
// TestParsePages_StripsFences checks that output wrapped in a ```json fence
// parses cleanly: one page, no warnings.
func TestParsePages_StripsFences(t *testing.T) {
	fenced := "```json\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"# Foo\"}]\n```"

	got, warns := ParsePages(fenced)

	assert.Len(t, got, 1)
	assert.Empty(t, warns)
}
// TestParsePages_TruncationRecovery checks that when the output is cut off in
// the middle of the second object, the first complete object is still returned
// and a warning is reported.
func TestParsePages_TruncationRecovery(t *testing.T) {
	const truncated = `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"trunc`

	got, warns := ParsePages(truncated)

	// require stops the test here so the index access below is safe.
	require.Len(t, got, 1)
	assert.Equal(t, "wiki/sources/foo.md", got[0].Path)
	assert.NotEmpty(t, warns)
}
// TestParsePages_EmptyInput checks that empty output yields no pages and at
// least one warning.
func TestParsePages_EmptyInput(t *testing.T) {
	got, warns := ParsePages("")

	assert.Empty(t, got)
	assert.NotEmpty(t, warns)
}
// TestParsePages_PlainFence checks that output wrapped in a bare ``` fence
// (no language tag) parses cleanly: one page, no warnings.
func TestParsePages_PlainFence(t *testing.T) {
	fenced := "```\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"ok\"}]\n```"

	got, warns := ParsePages(fenced)

	assert.Len(t, got, 1)
	assert.Empty(t, warns)
}