From 26b5636b43d24c6782f835c83bdb3f762debba01 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 18:41:33 +0200 Subject: [PATCH] feat(pipeline): replace ParsePages with ParseRawPages + RawPage type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strips slug authority from the LLM. The new RawPage type carries only {title, type, subtype, domain, content} — no paths or frontmatter. Pipeline will derive slugs deterministically (Task 4). pipeline.go gets a temporary bridge stub (TODO task4) to keep the package compiling between tasks. Co-Authored-By: Claude Sonnet 4.6 --- ingestion/internal/pipeline/parse.go | 18 +++++-- ingestion/internal/pipeline/parse_test.go | 59 ++++++++++++++--------- ingestion/internal/pipeline/pipeline.go | 7 ++- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/ingestion/internal/pipeline/parse.go b/ingestion/internal/pipeline/parse.go index ac9a38d..8a28191 100644 --- a/ingestion/internal/pipeline/parse.go +++ b/ingestion/internal/pipeline/parse.go @@ -5,13 +5,21 @@ import ( "encoding/json" "fmt" "strings" - - "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" ) -// ParsePages parses LLM output as a JSON array of {path, content} objects. +// RawPage is the LLM's output format — minimal structured data with no path or frontmatter. +// The pipeline derives slugs, paths, and frontmatter from these fields. +type RawPage struct { + Title string `json:"title"` + Type string `json:"type"` // "source" | "concept" | "entity" + Subtype string `json:"subtype"` // entity: person|company|tool|model|framework|technology; source: article|pdf|book|video|note|project + Domain string `json:"domain"` + Content string `json:"content"` // Markdown body only — no frontmatter +} + +// ParseRawPages parses LLM output as a JSON array of RawPage objects. // If the array is truncated mid-object (token limit), it salvages all complete objects. -func ParsePages(output string) ([]wiki.Page, []string) { +func ParseRawPages(output string) ([]RawPage, []string) { output = strings.TrimSpace(output) if output == "" { return nil, []string{"LLM returned empty output"} @@ -19,7 +27,7 @@ func ParsePages(output string) ([]wiki.Page, []string) { output = stripFences(output) - var pages []wiki.Page + var pages []RawPage if err := json.Unmarshal([]byte(output), &pages); err == nil { return pages, nil } diff --git a/ingestion/internal/pipeline/parse_test.go b/ingestion/internal/pipeline/parse_test.go index e84464d..46d05f1 100644 --- a/ingestion/internal/pipeline/parse_test.go +++ b/ingestion/internal/pipeline/parse_test.go @@ -8,39 +8,54 @@ import ( "github.com/stretchr/testify/require" ) -func TestParsePages_ValidJSON(t *testing.T) { - input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"# Bar"}]` - pages, warnings := ParsePages(input) +func TestParseRawPages_ValidJSON(t *testing.T) { + input := `[{"title":"Shape Up","type":"source","subtype":"book","domain":"product-strategy","content":"## Summary\n\nFoo."},{"title":"Betting","type":"concept","content":"## Definition\n\nA technique."}]` + pages, warnings := ParseRawPages(input) require.Len(t, pages, 2) assert.Empty(t, warnings) - assert.Equal(t, "wiki/sources/foo.md", pages[0].Path) - assert.Equal(t, "wiki/concepts/bar.md", pages[1].Path) + assert.Equal(t, "Shape Up", pages[0].Title) + assert.Equal(t, "source", pages[0].Type) + assert.Equal(t, "book", pages[0].Subtype) + assert.Equal(t, "product-strategy", pages[0].Domain) + assert.Equal(t, "Betting", pages[1].Title) + assert.Equal(t, "concept", pages[1].Type) + assert.Empty(t, pages[1].Subtype) } -func TestParsePages_StripsFences(t *testing.T) { - input := "```json\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"# Foo\"}]\n```" - pages, warnings := ParsePages(input) - assert.Len(t, pages, 1) - assert.Empty(t, warnings) -} - -func TestParsePages_TruncationRecovery(t *testing.T) { - input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"trunc` - pages, warnings := ParsePages(input) +func TestParseRawPages_StripsFences(t *testing.T) { + input := "```json\n[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"## Definition\\n\\nFoo.\"}]\n```" + pages, warnings := ParseRawPages(input) require.Len(t, pages, 1) - assert.Equal(t, "wiki/sources/foo.md", pages[0].Path) + assert.Empty(t, warnings) + assert.Equal(t, "Foo", pages[0].Title) +} + +func TestParseRawPages_TruncationRecovery(t *testing.T) { + input := `[{"title":"Foo","type":"concept","content":"## Definition\n\nFoo."},{"title":"Bar","type":"concept","content":"trunc` + pages, warnings := ParseRawPages(input) + require.Len(t, pages, 1) + assert.Equal(t, "Foo", pages[0].Title) assert.NotEmpty(t, warnings) } -func TestParsePages_EmptyInput(t *testing.T) { - pages, warnings := ParsePages("") +func TestParseRawPages_EmptyInput(t *testing.T) { + pages, warnings := ParseRawPages("") assert.Empty(t, pages) assert.NotEmpty(t, warnings) } -func TestParsePages_PlainFence(t *testing.T) { - input := "```\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"ok\"}]\n```" - pages, warnings := ParsePages(input) - assert.Len(t, pages, 1) +func TestParseRawPages_PlainFence(t *testing.T) { + input := "```\n[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"ok\"}]\n```" + pages, warnings := ParseRawPages(input) + require.Len(t, pages, 1) assert.Empty(t, warnings) } + +func TestParseRawPages_MissingTitle(t *testing.T) { + // Missing title — still parsed, Title is empty string + input := `[{"type":"concept","content":"## Definition\n\nFoo."}]` + pages, warnings := ParseRawPages(input) + require.Len(t, pages, 1) + assert.Empty(t, warnings) + assert.Empty(t, pages[0].Title) +} diff --git a/ingestion/internal/pipeline/pipeline.go b/ingestion/internal/pipeline/pipeline.go index 1163463..cb5d45a 100644 --- a/ingestion/internal/pipeline/pipeline.go +++ b/ingestion/internal/pipeline/pipeline.go @@ -52,8 +52,11 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR if err != nil { return Result{}, fmt.Errorf("LLM call: %w", err) } - pages, warnings := ParsePages(output) - allPages = append(allPages, pages...) + // TODO(task4): replace with RawPage-based pipeline + rawPages, warnings := ParseRawPages(output) + for _, rp := range rawPages { + allPages = append(allPages, wiki.Page{Path: rp.Type + "/" + rp.Title, Content: rp.Content}) + } allWarnings = append(allWarnings, warnings...) }