feat(pipeline): replace ParsePages with ParseRawPages + RawPage type
Strips slug authority from the LLM: the new RawPage type carries only
{title, type, subtype, domain, content} — no paths or frontmatter.
The pipeline will derive slugs deterministically (Task 4).
pipeline.go gets a temporary bridge stub (marked TODO(task4)) to keep the
package compiling between tasks.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,13 +5,21 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// ParsePages parses LLM output as a JSON array of {path, content} objects.
|
// RawPage is the LLM's output format — minimal structured data with no path or frontmatter.
|
||||||
|
// The pipeline derives slugs, paths, and frontmatter from these fields.
|
||||||
|
type RawPage struct {
|
||||||
|
Title string `json:"title"`
|
||||||
|
Type string `json:"type"` // "source" | "concept" | "entity"
|
||||||
|
Subtype string `json:"subtype"` // entity: person|company|tool|model|framework|technology; source: article|pdf|book|video|note|project
|
||||||
|
Domain string `json:"domain"`
|
||||||
|
Content string `json:"content"` // Markdown body only — no frontmatter
|
||||||
|
}
|
||||||
|
|
||||||
|
// ParseRawPages parses LLM output as a JSON array of RawPage objects.
|
||||||
// If the array is truncated mid-object (token limit), it salvages all complete objects.
|
// If the array is truncated mid-object (token limit), it salvages all complete objects.
|
||||||
func ParsePages(output string) ([]wiki.Page, []string) {
|
func ParseRawPages(output string) ([]RawPage, []string) {
|
||||||
output = strings.TrimSpace(output)
|
output = strings.TrimSpace(output)
|
||||||
if output == "" {
|
if output == "" {
|
||||||
return nil, []string{"LLM returned empty output"}
|
return nil, []string{"LLM returned empty output"}
|
||||||
@@ -19,7 +27,7 @@ func ParsePages(output string) ([]wiki.Page, []string) {
|
|||||||
|
|
||||||
output = stripFences(output)
|
output = stripFences(output)
|
||||||
|
|
||||||
var pages []wiki.Page
|
var pages []RawPage
|
||||||
if err := json.Unmarshal([]byte(output), &pages); err == nil {
|
if err := json.Unmarshal([]byte(output), &pages); err == nil {
|
||||||
return pages, nil
|
return pages, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,39 +8,54 @@ import (
|
|||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestParsePages_ValidJSON(t *testing.T) {
|
func TestParseRawPages_ValidJSON(t *testing.T) {
|
||||||
input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"# Bar"}]`
|
input := `[{"title":"Shape Up","type":"source","subtype":"book","domain":"product-strategy","content":"## Summary\n\nFoo."},{"title":"Betting","type":"concept","content":"## Definition\n\nA technique."}]`
|
||||||
pages, warnings := ParsePages(input)
|
pages, warnings := ParseRawPages(input)
|
||||||
require.Len(t, pages, 2)
|
require.Len(t, pages, 2)
|
||||||
assert.Empty(t, warnings)
|
assert.Empty(t, warnings)
|
||||||
assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
|
assert.Equal(t, "Shape Up", pages[0].Title)
|
||||||
assert.Equal(t, "wiki/concepts/bar.md", pages[1].Path)
|
assert.Equal(t, "source", pages[0].Type)
|
||||||
|
assert.Equal(t, "book", pages[0].Subtype)
|
||||||
|
assert.Equal(t, "product-strategy", pages[0].Domain)
|
||||||
|
assert.Equal(t, "Betting", pages[1].Title)
|
||||||
|
assert.Equal(t, "concept", pages[1].Type)
|
||||||
|
assert.Empty(t, pages[1].Subtype)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParsePages_StripsFences(t *testing.T) {
|
func TestParseRawPages_StripsFences(t *testing.T) {
|
||||||
input := "```json\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"# Foo\"}]\n```"
|
input := "```json\n[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"## Definition\\n\\nFoo.\"}]\n```"
|
||||||
pages, warnings := ParsePages(input)
|
pages, warnings := ParseRawPages(input)
|
||||||
assert.Len(t, pages, 1)
|
|
||||||
assert.Empty(t, warnings)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestParsePages_TruncationRecovery(t *testing.T) {
|
|
||||||
input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"trunc`
|
|
||||||
pages, warnings := ParsePages(input)
|
|
||||||
require.Len(t, pages, 1)
|
require.Len(t, pages, 1)
|
||||||
assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
|
assert.Empty(t, warnings)
|
||||||
|
assert.Equal(t, "Foo", pages[0].Title)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseRawPages_TruncationRecovery(t *testing.T) {
|
||||||
|
input := `[{"title":"Foo","type":"concept","content":"## Definition\n\nFoo."},{"title":"Bar","type":"concept","content":"trunc`
|
||||||
|
pages, warnings := ParseRawPages(input)
|
||||||
|
require.Len(t, pages, 1)
|
||||||
|
assert.Equal(t, "Foo", pages[0].Title)
|
||||||
assert.NotEmpty(t, warnings)
|
assert.NotEmpty(t, warnings)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParsePages_EmptyInput(t *testing.T) {
|
func TestParseRawPages_EmptyInput(t *testing.T) {
|
||||||
pages, warnings := ParsePages("")
|
pages, warnings := ParseRawPages("")
|
||||||
assert.Empty(t, pages)
|
assert.Empty(t, pages)
|
||||||
assert.NotEmpty(t, warnings)
|
assert.NotEmpty(t, warnings)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParsePages_PlainFence(t *testing.T) {
|
func TestParseRawPages_PlainFence(t *testing.T) {
|
||||||
input := "```\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"ok\"}]\n```"
|
input := "```\n[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"ok\"}]\n```"
|
||||||
pages, warnings := ParsePages(input)
|
pages, warnings := ParseRawPages(input)
|
||||||
assert.Len(t, pages, 1)
|
require.Len(t, pages, 1)
|
||||||
assert.Empty(t, warnings)
|
assert.Empty(t, warnings)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseRawPages_MissingTitle(t *testing.T) {
|
||||||
|
// Missing title — still parsed, Title is empty string
|
||||||
|
input := `[{"type":"concept","content":"## Definition\n\nFoo."}]`
|
||||||
|
pages, warnings := ParseRawPages(input)
|
||||||
|
require.Len(t, pages, 1)
|
||||||
|
assert.Empty(t, warnings)
|
||||||
|
assert.Empty(t, pages[0].Title)
|
||||||
|
}
|
||||||
|
|||||||
@@ -52,8 +52,11 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Result{}, fmt.Errorf("LLM call: %w", err)
|
return Result{}, fmt.Errorf("LLM call: %w", err)
|
||||||
}
|
}
|
||||||
pages, warnings := ParsePages(output)
|
// TODO(task4): replace with RawPage-based pipeline
|
||||||
allPages = append(allPages, pages...)
|
rawPages, warnings := ParseRawPages(output)
|
||||||
|
for _, rp := range rawPages {
|
||||||
|
allPages = append(allPages, wiki.Page{Path: rp.Type + "/" + rp.Title, Content: rp.Content})
|
||||||
|
}
|
||||||
allWarnings = append(allWarnings, warnings...)
|
allWarnings = append(allWarnings, warnings...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user