fix(pipeline): repair invalid JSON escape sequences from LLM output before parsing
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -18,7 +18,8 @@ type RawPage struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ParseRawPages parses LLM output as a JSON array of RawPage objects.
|
// ParseRawPages parses LLM output as a JSON array of RawPage objects.
|
||||||
// If the array is truncated mid-object (token limit), it salvages all complete objects.
|
// If the output contains invalid JSON escape sequences (e.g. \. from Markdown),
|
||||||
|
// it attempts repair before falling back to truncation recovery.
|
||||||
func ParseRawPages(output string) ([]RawPage, []string) {
|
func ParseRawPages(output string) ([]RawPage, []string) {
|
||||||
output = strings.TrimSpace(output)
|
output = strings.TrimSpace(output)
|
||||||
if output == "" {
|
if output == "" {
|
||||||
@@ -27,23 +28,30 @@ func ParseRawPages(output string) ([]RawPage, []string) {
|
|||||||
|
|
||||||
output = stripFences(output)
|
output = stripFences(output)
|
||||||
|
|
||||||
|
// Fast path: valid JSON.
|
||||||
var pages []RawPage
|
var pages []RawPage
|
||||||
if err := json.Unmarshal([]byte(output), &pages); err == nil {
|
if err := json.Unmarshal([]byte(output), &pages); err == nil {
|
||||||
return pages, nil
|
return pages, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Repair pass: fix invalid escape sequences (e.g. \. \d from Markdown content).
|
||||||
|
repaired := repairJSON(output)
|
||||||
|
if err := json.Unmarshal([]byte(repaired), &pages); err == nil {
|
||||||
|
return pages, []string{"repaired invalid JSON escape sequences in LLM output"}
|
||||||
|
}
|
||||||
|
|
||||||
// Truncation recovery: find last `}` that closes a complete object.
|
// Truncation recovery: find last `}` that closes a complete object.
|
||||||
idx := strings.LastIndex(output, "}")
|
idx := strings.LastIndex(repaired, "}")
|
||||||
if idx < 0 {
|
if idx < 0 {
|
||||||
return nil, []string{"LLM output contained no complete JSON objects"}
|
return nil, []string{"LLM output contained no complete JSON objects"}
|
||||||
}
|
}
|
||||||
|
|
||||||
start := strings.Index(output, "[")
|
start := strings.Index(repaired, "[")
|
||||||
if start < 0 {
|
if start < 0 {
|
||||||
return nil, []string{"LLM output contained no JSON array opening bracket"}
|
return nil, []string{"LLM output contained no JSON array opening bracket"}
|
||||||
}
|
}
|
||||||
|
|
||||||
candidate := output[start:idx+1] + "]"
|
candidate := repaired[start:idx+1] + "]"
|
||||||
if err := json.Unmarshal([]byte(candidate), &pages); err != nil {
|
if err := json.Unmarshal([]byte(candidate), &pages); err != nil {
|
||||||
return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
|
return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
|
||||||
}
|
}
|
||||||
@@ -51,6 +59,45 @@ func ParseRawPages(output string) ([]RawPage, []string) {
|
|||||||
return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
|
return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// repairJSON replaces invalid JSON escape sequences (e.g. \. \d \p) with
// a properly escaped backslash followed by the same character.
//
// The input is scanned byte-by-byte so that an already-valid escape pair
// (including \\) is consumed as a unit and its second byte is never mistaken
// for the start of a new escape; this avoids needing regexp lookbehind.
//
// A \u is only accepted as valid when it is followed by four hexadecimal
// digits. Otherwise (for example the \u in C:\users, or a truncated \u12)
// the backslash is doubled like any other invalid escape, because leaving it
// alone would still make the JSON decoder reject the document.
//
// A lone backslash at the very end of s is emitted unchanged. The scan does
// not track whether it is inside a JSON string literal: every backslash in s
// is examined.
func repairJSON(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); {
		c := s[i]
		if c != '\\' {
			b.WriteByte(c)
			i++
			continue
		}
		// We have a backslash. Peek at the next character.
		if i+1 >= len(s) {
			// Trailing backslash: nothing to pair it with, emit as-is.
			b.WriteByte(c)
			i++
			continue
		}
		next := s[i+1]
		valid := false
		switch next {
		case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
			valid = true
		case 'u':
			// i+1 < len(s) holds here, so s[i+2:] is always in range.
			valid = hasFourHexDigits(s[i+2:])
		}
		if !valid {
			// Invalid escape: double the backslash so it decodes to a
			// literal backslash followed by the same character.
			b.WriteByte('\\')
		}
		b.WriteByte('\\')
		b.WriteByte(next)
		i += 2
	}
	return b.String()
}

// hasFourHexDigits reports whether s begins with four hexadecimal digits.
func hasFourHexDigits(s string) bool {
	if len(s) < 4 {
		return false
	}
	for i := 0; i < 4; i++ {
		c := s[i]
		isHex := ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
		if !isHex {
			return false
		}
	}
	return true
}
|
||||||
|
|
||||||
func stripFences(s string) string {
|
func stripFences(s string) string {
|
||||||
for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} {
|
for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} {
|
||||||
if strings.HasPrefix(s, prefix) {
|
if strings.HasPrefix(s, prefix) {
|
||||||
|
|||||||
@@ -59,3 +59,29 @@ func TestParseRawPages_MissingTitle(t *testing.T) {
|
|||||||
assert.Empty(t, warnings)
|
assert.Empty(t, warnings)
|
||||||
assert.Empty(t, pages[0].Title)
|
assert.Empty(t, pages[0].Title)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseRawPages_InvalidEscapeRepaired(t *testing.T) {
|
||||||
|
// LLM copied markdown escaped list numbers (\.) into JSON — invalid escape
|
||||||
|
raw := "[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"Step 4\\. Do it.\"}]"
|
||||||
|
pages, warnings := ParseRawPages(raw)
|
||||||
|
require.Len(t, pages, 1)
|
||||||
|
assert.Equal(t, "Foo", pages[0].Title)
|
||||||
|
assert.Contains(t, pages[0].Content, `4\.`)
|
||||||
|
assert.NotEmpty(t, warnings) // repair warning
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRepairJSON_FixesInvalidEscapes(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
in string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{`{"a":"foo\.bar"}`, `{"a":"foo\\.bar"}`},
|
||||||
|
{`{"a":"\\n is fine"}`, `{"a":"\\n is fine"}`}, // valid \n untouched
|
||||||
|
{`{"a":"\d+ items"}`, `{"a":"\\d+ items"}`},
|
||||||
|
{`{"a":"already \\ escaped"}`, `{"a":"already \\ escaped"}`}, // valid \\ untouched
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
got := repairJSON(tc.in)
|
||||||
|
assert.Equal(t, tc.want, got, "input: %s", tc.in)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user