Files
hyperguild/ingestion/internal/pipeline/parse.go

56 lines
1.5 KiB
Go

// ingestion/internal/pipeline/parse.go
package pipeline
import (
"encoding/json"
"fmt"
"strings"
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
)
// ParsePages parses LLM output as a JSON array of page objects and decodes it
// into []wiki.Page (the expected element shape is {path, content}; the exact
// field set is whatever wiki.Page declares).
//
// Surrounding whitespace and an enclosing Markdown code fence are removed
// before parsing. The second return value carries human-readable diagnostics:
// it is nil when the whole output parsed cleanly, and holds a single message
// when parsing failed or when only part of the output could be recovered.
//
// If the array cannot be parsed as a whole (typically because the model hit
// its token limit mid-object), every element that decodes completely before
// the first broken one is salvaged and returned together with a warning.
func ParsePages(output string) ([]wiki.Page, []string) {
	output = strings.TrimSpace(output)
	if output == "" {
		return nil, []string{"LLM returned empty output"}
	}
	output = stripFences(output)

	var pages []wiki.Page
	if err := json.Unmarshal([]byte(output), &pages); err == nil {
		return pages, nil
	}

	// Truncation recovery. Without any closing brace there cannot be a
	// complete object, so bail out early with a specific message.
	if !strings.Contains(output, "}") {
		return nil, []string{"LLM output contained no complete JSON objects"}
	}
	start := strings.Index(output, "[")
	if start < 0 {
		return nil, []string{"LLM output contained no JSON array opening bracket"}
	}

	// Stream the array element by element instead of cutting the text at the
	// last "}": a brace inside a truncated string value would otherwise be
	// mistaken for the end of an object and make the whole recovery fail, and
	// a "}" located before the "[" would produce an invalid slice range.
	dec := json.NewDecoder(strings.NewReader(output[start:]))
	if _, err := dec.Token(); err != nil { // consume the opening '['
		return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
	}

	// Use a fresh slice so nothing left behind by the failed Unmarshal above
	// can leak into the recovered result.
	var recovered []wiki.Page
	var decodeErr error
	for dec.More() {
		var page wiki.Page
		if decodeErr = dec.Decode(&page); decodeErr != nil {
			// First incomplete or invalid element: keep what we have so far.
			break
		}
		recovered = append(recovered, page)
	}

	if len(recovered) == 0 {
		if decodeErr != nil {
			return nil, []string{fmt.Sprintf("truncation recovery failed: %v", decodeErr)}
		}
		return nil, []string{"LLM output contained no complete JSON objects"}
	}
	return recovered, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(recovered))}
}
// stripFences removes a Markdown code fence wrapped around s.
//
// Only an opening fence at the very start of s is recognised: "```json" or a
// bare "```", followed by either an LF or a CRLF line ending. When one is
// found, it is dropped, a trailing "```" is removed if present, and the
// remainder is returned with surrounding whitespace trimmed. Input without a
// recognised opening fence is returned unchanged.
func stripFences(s string) string {
	openers := [...]string{"```json\n", "```json\r\n", "```\n", "```\r\n"}
	for i := range openers {
		if !strings.HasPrefix(s, openers[i]) {
			continue
		}
		// Trim before looking for the closing fence so that trailing
		// whitespace after it does not hide the "```" suffix.
		body := strings.TrimSpace(s[len(openers[i]):])
		body = strings.TrimSuffix(body, "```")
		return strings.TrimSpace(body)
	}
	return s
}