From 3e9a6481155e86a582a0d088241a7b4b97926989 Mon Sep 17 00:00:00 2001
From: Mathias Bergqvist
Date: Thu, 23 Apr 2026 22:04:27 +0200
Subject: [PATCH] fix(pipeline): repair invalid JSON escape sequences from LLM output before parsing

Co-Authored-By: Claude Sonnet 4.6
---
 ingestion/internal/pipeline/parse.go      | 74 +++++++++++++++++++++++++++++--
 ingestion/internal/pipeline/parse_test.go | 28 ++++++++++++
 2 files changed, 98 insertions(+), 4 deletions(-)

diff --git a/ingestion/internal/pipeline/parse.go b/ingestion/internal/pipeline/parse.go
index 8a28191..ee4a23f 100644
--- a/ingestion/internal/pipeline/parse.go
+++ b/ingestion/internal/pipeline/parse.go
@@ -18,7 +18,8 @@ type RawPage struct {
 }
 
 // ParseRawPages parses LLM output as a JSON array of RawPage objects.
-// If the array is truncated mid-object (token limit), it salvages all complete objects.
+// If the output contains invalid JSON escape sequences (e.g. \. from Markdown),
+// it attempts repair before falling back to truncation recovery.
 func ParseRawPages(output string) ([]RawPage, []string) {
 	output = strings.TrimSpace(output)
 	if output == "" {
@@ -27,23 +28,30 @@ func ParseRawPages(output string) ([]RawPage, []string) {
 
 	output = stripFences(output)
 
+	// Fast path: valid JSON.
 	var pages []RawPage
 	if err := json.Unmarshal([]byte(output), &pages); err == nil {
 		return pages, nil
 	}
 
+	// Repair pass: fix invalid escape sequences (e.g. \. \d from Markdown content).
+	repaired := repairJSON(output)
+	if err := json.Unmarshal([]byte(repaired), &pages); err == nil {
+		return pages, []string{"repaired invalid JSON escape sequences in LLM output"}
+	}
+
 	// Truncation recovery: find last `}` that closes a complete object.
-	idx := strings.LastIndex(output, "}")
+	idx := strings.LastIndex(repaired, "}")
 	if idx < 0 {
 		return nil, []string{"LLM output contained no complete JSON objects"}
 	}
 
-	start := strings.Index(output, "[")
+	start := strings.Index(repaired, "[")
 	if start < 0 {
 		return nil, []string{"LLM output contained no JSON array opening bracket"}
 	}
 
-	candidate := output[start:idx+1] + "]"
+	candidate := repaired[start:idx+1] + "]"
 	if err := json.Unmarshal([]byte(candidate), &pages); err != nil {
 		return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
 	}
@@ -51,6 +59,64 @@ func ParseRawPages(output string) ([]RawPage, []string) {
 	return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
 }
 
+// repairJSON replaces invalid JSON escape sequences (e.g. \. \d \p) with
+// a properly escaped backslash followed by the same character.
+// It iterates byte-by-byte to correctly skip already-valid escape sequences
+// (including \\) without requiring lookbehind support.
+// A \u escape only counts as valid when four hex digits follow it; otherwise
+// (e.g. the \u in C:\users) its backslash is doubled like any other.
+func repairJSON(s string) string {
+	var b strings.Builder
+	b.Grow(len(s))
+	i := 0
+	for i < len(s) {
+		if s[i] != '\\' {
+			b.WriteByte(s[i])
+			i++
+			continue
+		}
+		// We have a backslash. Peek at the next character.
+		if i+1 >= len(s) {
+			// Trailing backslash — emit as-is.
+			b.WriteByte(s[i])
+			i++
+			continue
+		}
+		next := s[i+1]
+		valid := false
+		switch next {
+		case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
+			valid = true
+		case 'u':
+			valid = isHexQuad(s[i+2:])
+		}
+		if !valid {
+			// Invalid escape — double the backslash.
+			b.WriteByte('\\')
+		}
+		// Emit the escape itself; any \u hex digits are copied by later iterations.
+		b.WriteByte('\\')
+		b.WriteByte(next)
+		i += 2
+	}
+	return b.String()
+}
+
+// isHexQuad reports whether s starts with four hexadecimal digits.
+func isHexQuad(s string) bool {
+	if len(s) < 4 {
+		return false
+	}
+	for i := 0; i < 4; i++ {
+		c := s[i]
+		isHex := ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
+		if !isHex {
+			return false
+		}
+	}
+	return true
+}
+
 func stripFences(s string) string {
 	for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} {
 		if strings.HasPrefix(s, prefix) {
diff --git a/ingestion/internal/pipeline/parse_test.go b/ingestion/internal/pipeline/parse_test.go
index 46d05f1..3e6e025 100644
--- a/ingestion/internal/pipeline/parse_test.go
+++ b/ingestion/internal/pipeline/parse_test.go
@@ -59,3 +59,31 @@ func TestParseRawPages_MissingTitle(t *testing.T) {
 	assert.Empty(t, warnings)
 	assert.Empty(t, pages[0].Title)
 }
+
+func TestParseRawPages_InvalidEscapeRepaired(t *testing.T) {
+	// LLM copied markdown escaped list numbers (\.) into JSON — invalid escape
+	raw := "[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"Step 4\\. Do it.\"}]"
+	pages, warnings := ParseRawPages(raw)
+	require.Len(t, pages, 1)
+	assert.Equal(t, "Foo", pages[0].Title)
+	assert.Contains(t, pages[0].Content, `4\.`)
+	assert.NotEmpty(t, warnings) // repair warning
+}
+
+func TestRepairJSON_FixesInvalidEscapes(t *testing.T) {
+	cases := []struct {
+		in   string
+		want string
+	}{
+		{`{"a":"foo\.bar"}`, `{"a":"foo\\.bar"}`},
+		{`{"a":"\n is fine"}`, `{"a":"\n is fine"}`}, // valid \n untouched
+		{`{"a":"\d+ items"}`, `{"a":"\\d+ items"}`},
+		{`{"a":"already \\ escaped"}`, `{"a":"already \\ escaped"}`}, // valid \\ untouched
+		{`{"a":"C:\users\me"}`, `{"a":"C:\\users\\me"}`}, // \u without four hex digits is repaired
+		{`{"a":"\u00e9 ok"}`, `{"a":"\u00e9 ok"}`}, // valid \uXXXX untouched
+	}
+	for _, tc := range cases {
+		got := repairJSON(tc.in)
+		assert.Equal(t, tc.want, got, "input: %s", tc.in)
+	}
+}