// ingestion/internal/pipeline/parse.go package pipeline import ( "encoding/json" "fmt" "strings" ) // RawPage is the LLM's output format — minimal structured data with no path or frontmatter. // The pipeline derives slugs, paths, and frontmatter from these fields. type RawPage struct { Title string `json:"title"` Type string `json:"type"` // "source" | "concept" | "entity" Subtype string `json:"subtype"` // entity: person|company|tool|model|framework|technology; source: article|pdf|book|video|note|project Domain string `json:"domain"` Content string `json:"content"` // Markdown body only — no frontmatter } // ParseRawPages parses LLM output as a JSON array of RawPage objects. // If the array is truncated mid-object (token limit), it salvages all complete objects. func ParseRawPages(output string) ([]RawPage, []string) { output = strings.TrimSpace(output) if output == "" { return nil, []string{"LLM returned empty output"} } output = stripFences(output) var pages []RawPage if err := json.Unmarshal([]byte(output), &pages); err == nil { return pages, nil } // Truncation recovery: find last `}` that closes a complete object. idx := strings.LastIndex(output, "}") if idx < 0 { return nil, []string{"LLM output contained no complete JSON objects"} } start := strings.Index(output, "[") if start < 0 { return nil, []string{"LLM output contained no JSON array opening bracket"} } candidate := output[start:idx+1] + "]" if err := json.Unmarshal([]byte(candidate), &pages); err != nil { return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)} } return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))} } func stripFences(s string) string { for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} { if strings.HasPrefix(s, prefix) { s = strings.TrimPrefix(s, prefix) s = strings.TrimSuffix(strings.TrimSpace(s), "```") return strings.TrimSpace(s) } } return s }