Files
Mathias Bergqvist 3e9a648115
All checks were successful
CI / Lint / Test / Vet (push) Successful in 11s
CI / Mirror to GitHub (push) Has been skipped
fix(pipeline): repair invalid JSON escape sequences from LLM output before parsing
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-23 22:04:27 +02:00

88 lines
3.0 KiB
Go

// ingestion/internal/pipeline/parse_test.go
package pipeline
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestParseRawPages_ValidJSON hands ParseRawPages a well-formed JSON array of
// two pages and expects both to be returned, with their fields populated and
// no warnings reported.
func TestParseRawPages_ValidJSON(t *testing.T) {
	const payload = `[{"title":"Shape Up","type":"source","subtype":"book","domain":"product-strategy","content":"## Summary\n\nFoo."},{"title":"Betting","type":"concept","content":"## Definition\n\nA technique."}]`

	got, warns := ParseRawPages(payload)

	// require aborts the test on a wrong count, so the indexing below is safe.
	require.Len(t, got, 2)
	assert.Empty(t, warns)

	// First page carries every optional field.
	source := got[0]
	assert.Equal(t, "Shape Up", source.Title)
	assert.Equal(t, "source", source.Type)
	assert.Equal(t, "book", source.Subtype)
	assert.Equal(t, "product-strategy", source.Domain)

	// Second page omits "subtype" in the input, so Subtype is expected to be empty.
	concept := got[1]
	assert.Equal(t, "Betting", concept.Title)
	assert.Equal(t, "concept", concept.Type)
	assert.Empty(t, concept.Subtype)
}
// TestParseRawPages_StripsFences wraps a one-page JSON array in a ```json
// Markdown code fence and expects the page to be parsed without warnings.
func TestParseRawPages_StripsFences(t *testing.T) {
	// The JSON body is a raw string, so the \n sequences stay as JSON escapes;
	// only the fence lines contain real newlines.
	const body = `[{"title":"Foo","type":"concept","content":"## Definition\n\nFoo."}]`
	fenced := "```json\n" + body + "\n```"

	got, warns := ParseRawPages(fenced)

	require.Len(t, got, 1)
	assert.Empty(t, warns)
	assert.Equal(t, "Foo", got[0].Title)
}
// TestParseRawPages_TruncationRecovery passes an array whose second object is
// cut off mid-string. It expects the complete first page to be returned and at
// least one warning to be reported.
func TestParseRawPages_TruncationRecovery(t *testing.T) {
	const truncated = `[{"title":"Foo","type":"concept","content":"## Definition\n\nFoo."},{"title":"Bar","type":"concept","content":"trunc`

	got, warns := ParseRawPages(truncated)

	// Only the fully-formed first object is expected back.
	require.Len(t, got, 1)
	assert.Equal(t, "Foo", got[0].Title)
	assert.NotEmpty(t, warns)
}
// TestParseRawPages_EmptyInput expects an empty string to produce no pages and
// at least one warning.
func TestParseRawPages_EmptyInput(t *testing.T) {
	const blank = ""

	got, warns := ParseRawPages(blank)

	assert.Empty(t, got)
	assert.NotEmpty(t, warns)
}
// TestParseRawPages_PlainFence wraps a one-page JSON array in a bare ``` fence
// (no language tag) and expects it to be parsed without warnings.
func TestParseRawPages_PlainFence(t *testing.T) {
	const body = `[{"title":"Foo","type":"concept","content":"ok"}]`
	fenced := "```\n" + body + "\n```"

	got, warns := ParseRawPages(fenced)

	require.Len(t, got, 1)
	assert.Empty(t, warns)
}
// TestParseRawPages_MissingTitle checks that an object without a "title" key
// is still returned as a page, with an empty Title and no warnings.
func TestParseRawPages_MissingTitle(t *testing.T) {
	// The object deliberately has no "title" key.
	const untitled = `[{"type":"concept","content":"## Definition\n\nFoo."}]`

	got, warns := ParseRawPages(untitled)

	require.Len(t, got, 1)
	assert.Empty(t, warns)
	assert.Empty(t, got[0].Title)
}
// TestParseRawPages_InvalidEscapeRepaired covers LLM output that copies a
// Markdown-escaped list number ("4\.") into a JSON string. Backslash-dot is
// not a valid JSON escape; the test expects the page to be returned anyway,
// with the backslash kept in Content and a warning reported.
func TestParseRawPages_InvalidEscapeRepaired(t *testing.T) {
	// Raw string: the single backslash before the dot reaches the parser as-is.
	const broken = `[{"title":"Foo","type":"concept","content":"Step 4\. Do it."}]`

	got, warns := ParseRawPages(broken)

	require.Len(t, got, 1)
	page := got[0]
	assert.Equal(t, "Foo", page.Title)
	assert.Contains(t, page.Content, `4\.`)
	// The repair is expected to be reported as a warning.
	assert.NotEmpty(t, warns)
}
// TestRepairJSON_FixesInvalidEscapes is a table-driven check of repairJSON.
// Invalid JSON escapes (a backslash before a character JSON does not allow
// there) are expected to get their backslash doubled, while sequences that
// are already valid JSON are expected to come back unchanged.
//
// Every literal in the table is a raw string, so each backslash written below
// is one literal backslash in the text handed to repairJSON.
func TestRepairJSON_FixesInvalidEscapes(t *testing.T) {
	cases := []struct {
		name string
		in   string
		want string
	}{
		{"backslash-dot gets escaped", `{"a":"foo\.bar"}`, `{"a":"foo\\.bar"}`},
		// A real \n escape. The previous table labelled the `\\n` row below as
		// "valid \n", but that row is an escaped backslash followed by 'n',
		// so the \n escape itself was never covered.
		{"valid newline escape untouched", `{"a":"\n is fine"}`, `{"a":"\n is fine"}`},
		{"escaped backslash before n untouched", `{"a":"\\n is fine"}`, `{"a":"\\n is fine"}`},
		{"backslash-d gets escaped", `{"a":"\d+ items"}`, `{"a":"\\d+ items"}`},
		{"escaped backslash untouched", `{"a":"already \\ escaped"}`, `{"a":"already \\ escaped"}`},
	}
	for _, tc := range cases {
		// Subtests let a failure name the offending row and let a single row
		// be selected with -run.
		t.Run(tc.name, func(t *testing.T) {
			got := repairJSON(tc.in)
			assert.Equal(t, tc.want, got, "input: %s", tc.in)
		})
	}
}