// ingestion/internal/pipeline/parse.go package pipeline import ( "encoding/json" "fmt" "strings" "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" ) // ParsePages parses LLM output as a JSON array of {path, content} objects. // If the array is truncated mid-object (token limit), it salvages all complete objects. func ParsePages(output string) ([]wiki.Page, []string) { output = strings.TrimSpace(output) if output == "" { return nil, []string{"LLM returned empty output"} } output = stripFences(output) var pages []wiki.Page if err := json.Unmarshal([]byte(output), &pages); err == nil { return pages, nil } // Truncation recovery: find last `}` that closes a complete object. idx := strings.LastIndex(output, "}") if idx < 0 { return nil, []string{"LLM output contained no complete JSON objects"} } start := strings.Index(output, "[") if start < 0 { return nil, []string{"LLM output contained no JSON array opening bracket"} } candidate := output[start:idx+1] + "]" if err := json.Unmarshal([]byte(candidate), &pages); err != nil { return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)} } return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))} } func stripFences(s string) string { for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} { if strings.HasPrefix(s, prefix) { s = strings.TrimPrefix(s, prefix) s = strings.TrimSuffix(strings.TrimSpace(s), "```") return strings.TrimSpace(s) } } return s }