fix(pipeline): repair invalid JSON escape sequences from LLM output before parsing
All checks were successful
CI / Lint / Test / Vet (push) Successful in 11s
CI / Mirror to GitHub (push) Has been skipped

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mathias Bergqvist
2026-04-23 22:04:27 +02:00
parent 923a665365
commit 3e9a648115
2 changed files with 77 additions and 4 deletions

View File

@@ -59,3 +59,29 @@ func TestParseRawPages_MissingTitle(t *testing.T) {
assert.Empty(t, warnings)
assert.Empty(t, pages[0].Title)
}
func TestParseRawPages_InvalidEscapeRepaired(t *testing.T) {
// LLM copied markdown escaped list numbers (\.) into JSON — invalid escape
raw := "[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"Step 4\\. Do it.\"}]"
pages, warnings := ParseRawPages(raw)
require.Len(t, pages, 1)
assert.Equal(t, "Foo", pages[0].Title)
assert.Contains(t, pages[0].Content, `4\.`)
assert.NotEmpty(t, warnings) // repair warning
}
func TestRepairJSON_FixesInvalidEscapes(t *testing.T) {
cases := []struct {
in string
want string
}{
{`{"a":"foo\.bar"}`, `{"a":"foo\\.bar"}`},
{`{"a":"\\n is fine"}`, `{"a":"\\n is fine"}`}, // valid \n untouched
{`{"a":"\d+ items"}`, `{"a":"\\d+ items"}`},
{`{"a":"already \\ escaped"}`, `{"a":"already \\ escaped"}`}, // valid \\ untouched
}
for _, tc := range cases {
got := repairJSON(tc.in)
assert.Equal(t, tc.want, got, "input: %s", tc.in)
}
}