feat(ingestion): add content chunking and LLM JSON output parser
This commit is contained in:
39
ingestion/internal/pipeline/chunk.go
Normal file
39
ingestion/internal/pipeline/chunk.go
Normal file
@@ -0,0 +1,39 @@
|
||||
// ingestion/internal/pipeline/chunk.go
|
||||
package pipeline
|
||||
|
||||
import "strings"
|
||||
|
||||
// Chunk trims content and groups its paragraphs (separated by "\n\n") into
// chunks, greedily packing consecutive paragraphs — rejoined with "\n\n" —
// while the running chunk stays within maxSize bytes.
//
// Paragraphs are whitespace-trimmed and empty ones are dropped. A paragraph is
// never split, so a single paragraph longer than maxSize is returned as its
// own oversized chunk. If maxSize <= 0, or the trimmed content already fits in
// maxSize bytes, the trimmed content is returned as the only chunk.
func Chunk(content string, maxSize int) []string {
	trimmed := strings.TrimSpace(content)
	if maxSize <= 0 || len(trimmed) <= maxSize {
		return []string{trimmed}
	}

	const sep = "\n\n"

	var (
		out     []string
		pending strings.Builder
	)
	for _, raw := range strings.Split(trimmed, sep) {
		p := strings.TrimSpace(raw)
		switch {
		case p == "":
			// Blank paragraph: nothing to add.
		case pending.Len() == 0:
			// First paragraph of a chunk is always accepted, even if oversized.
			pending.WriteString(p)
		case pending.Len()+len(sep)+len(p) > maxSize:
			// Appending would overflow: flush and start a new chunk with p.
			out = append(out, pending.String())
			pending.Reset()
			pending.WriteString(p)
		default:
			pending.WriteString(sep)
			pending.WriteString(p)
		}
	}
	if pending.Len() > 0 {
		out = append(out, pending.String())
	}
	return out
}
|
||||
36
ingestion/internal/pipeline/chunk_test.go
Normal file
36
ingestion/internal/pipeline/chunk_test.go
Normal file
@@ -0,0 +1,36 @@
|
||||
// ingestion/internal/pipeline/chunk_test.go
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestChunk_NoChunkingWhenZero(t *testing.T) {
|
||||
content := strings.Repeat("word ", 1000)
|
||||
chunks := Chunk(content, 0)
|
||||
assert.Len(t, chunks, 1)
|
||||
}
|
||||
|
||||
func TestChunk_SplitsAtParagraph(t *testing.T) {
|
||||
content := "First paragraph here.\n\nSecond paragraph here."
|
||||
chunks := Chunk(content, 40)
|
||||
assert.Len(t, chunks, 2)
|
||||
assert.Equal(t, "First paragraph here.", chunks[0])
|
||||
assert.Equal(t, "Second paragraph here.", chunks[1])
|
||||
}
|
||||
|
||||
func TestChunk_SingleLargeParagraph(t *testing.T) {
|
||||
content := strings.Repeat("x", 100)
|
||||
chunks := Chunk(content, 50)
|
||||
assert.Len(t, chunks, 1)
|
||||
}
|
||||
|
||||
func TestChunk_NoChunkingWhenContentFits(t *testing.T) {
|
||||
content := "Short content."
|
||||
chunks := Chunk(content, 1000)
|
||||
assert.Len(t, chunks, 1)
|
||||
assert.Equal(t, "Short content.", chunks[0])
|
||||
}
|
||||
55
ingestion/internal/pipeline/parse.go
Normal file
55
ingestion/internal/pipeline/parse.go
Normal file
@@ -0,0 +1,55 @@
|
||||
// ingestion/internal/pipeline/parse.go
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||
)
|
||||
|
||||
// ParsePages parses LLM output as a JSON array of {path, content} objects.
|
||||
// If the array is truncated mid-object (token limit), it salvages all complete objects.
|
||||
func ParsePages(output string) ([]wiki.Page, []string) {
|
||||
output = strings.TrimSpace(output)
|
||||
if output == "" {
|
||||
return nil, []string{"LLM returned empty output"}
|
||||
}
|
||||
|
||||
output = stripFences(output)
|
||||
|
||||
var pages []wiki.Page
|
||||
if err := json.Unmarshal([]byte(output), &pages); err == nil {
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// Truncation recovery: find last `}` that closes a complete object.
|
||||
idx := strings.LastIndex(output, "}")
|
||||
if idx < 0 {
|
||||
return nil, []string{"LLM output contained no complete JSON objects"}
|
||||
}
|
||||
|
||||
start := strings.Index(output, "[")
|
||||
if start < 0 {
|
||||
return nil, []string{"LLM output contained no JSON array opening bracket"}
|
||||
}
|
||||
|
||||
candidate := output[start:idx+1] + "]"
|
||||
if err := json.Unmarshal([]byte(candidate), &pages); err != nil {
|
||||
return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
|
||||
}
|
||||
|
||||
return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
|
||||
}
|
||||
|
||||
// stripFences removes a Markdown code fence wrapped around s.
//
// If s starts with "```json" or "```" immediately followed by a line break
// (LF or CRLF), that opening fence is dropped, the remainder is
// whitespace-trimmed, a trailing "```" is removed if present, and the result
// is trimmed again. Otherwise s is returned unchanged.
func stripFences(s string) string {
	openers := [...]string{"```json\n", "```json\r\n", "```\n", "```\r\n"}
	for i := range openers {
		fence := openers[i]
		if !strings.HasPrefix(s, fence) {
			continue
		}
		body := strings.TrimSpace(s[len(fence):])
		body = strings.TrimSuffix(body, "```")
		return strings.TrimSpace(body)
	}
	return s
}
|
||||
46
ingestion/internal/pipeline/parse_test.go
Normal file
46
ingestion/internal/pipeline/parse_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
// ingestion/internal/pipeline/parse_test.go
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestParsePages_ValidJSON(t *testing.T) {
|
||||
input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"# Bar"}]`
|
||||
pages, warnings := ParsePages(input)
|
||||
require.Len(t, pages, 2)
|
||||
assert.Empty(t, warnings)
|
||||
assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
|
||||
assert.Equal(t, "wiki/concepts/bar.md", pages[1].Path)
|
||||
}
|
||||
|
||||
func TestParsePages_StripsFences(t *testing.T) {
|
||||
input := "```json\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"# Foo\"}]\n```"
|
||||
pages, warnings := ParsePages(input)
|
||||
assert.Len(t, pages, 1)
|
||||
assert.Empty(t, warnings)
|
||||
}
|
||||
|
||||
func TestParsePages_TruncationRecovery(t *testing.T) {
|
||||
input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"trunc`
|
||||
pages, warnings := ParsePages(input)
|
||||
require.Len(t, pages, 1)
|
||||
assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
|
||||
assert.NotEmpty(t, warnings)
|
||||
}
|
||||
|
||||
func TestParsePages_EmptyInput(t *testing.T) {
|
||||
pages, warnings := ParsePages("")
|
||||
assert.Empty(t, pages)
|
||||
assert.NotEmpty(t, warnings)
|
||||
}
|
||||
|
||||
func TestParsePages_PlainFence(t *testing.T) {
|
||||
input := "```\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"ok\"}]\n```"
|
||||
pages, warnings := ParsePages(input)
|
||||
assert.Len(t, pages, 1)
|
||||
assert.Empty(t, warnings)
|
||||
}
|
||||
Reference in New Issue
Block a user