From de35d4dbb071928a3b425630f1b594936add7ceb Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 19:07:33 +0200 Subject: [PATCH] feat(pipeline): wire ParseRawPages+BuildPages+CanonicalizeLinks into Run Co-Authored-By: Claude Sonnet 4.6 --- ingestion/internal/pipeline/pipeline.go | 24 +++++------- ingestion/internal/pipeline/pipeline_test.go | 41 +++++++++++--------- ingestion/internal/watcher/watcher_test.go | 30 +++++++------- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/ingestion/internal/pipeline/pipeline.go b/ingestion/internal/pipeline/pipeline.go index 0706a44..f0f273c 100644 --- a/ingestion/internal/pipeline/pipeline.go +++ b/ingestion/internal/pipeline/pipeline.go @@ -41,9 +41,11 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR schema = loadSchema(brainDir) } + sourceSlug := wiki.Slug(source) + date := time.Now().UTC().Format("2006-01-02") chunks := Chunk(content, cfg.ChunkSize) - var allPages []wiki.Page + var allRaw []RawPage var allWarnings []string for _, chunk := range chunks { @@ -52,25 +54,19 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR if err != nil { return Result{}, fmt.Errorf("LLM call: %w", err) } - // TODO(task4): replace with RawPage-based pipeline - rawPages, warnings := ParseRawPages(output) - for _, rp := range rawPages { - if rp.Title == "" { - allWarnings = append(allWarnings, "skipped RawPage with empty title (TODO task4)") - continue - } - allPages = append(allPages, wiki.Page{Path: rp.Type + "/" + rp.Title, Content: rp.Content}) - } + raw, warnings := ParseRawPages(output) + allRaw = append(allRaw, raw...) allWarnings = append(allWarnings, warnings...) } - resolved := Resolve(allPages, inventory) - withRefs := injectSourceRefs(resolved, inventory, brainDir) + pages := BuildPages(allRaw, sourceSlug, date) + resolved := Resolve(pages, inventory) + canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory) + allWarnings = append(allWarnings, linkWarnings...) + withRefs := injectSourceRefs(canonicalized, inventory, brainDir) merged := mergeAll(withRefs) - date := time.Now().UTC().Format("2006-01-02") var written []string - for _, page := range merged { if !dryRun { dest := filepath.Join(brainDir, filepath.FromSlash(page.Path)) diff --git a/ingestion/internal/pipeline/pipeline_test.go b/ingestion/internal/pipeline/pipeline_test.go index 6140baa..3467201 100644 --- a/ingestion/internal/pipeline/pipeline_test.go +++ b/ingestion/internal/pipeline/pipeline_test.go @@ -15,24 +15,27 @@ import ( "github.com/stretchr/testify/require" "github.com/mathiasbq/hyperguild/ingestion/internal/llm" - "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" ) func TestRun_WritesPages(t *testing.T) { - t.Skip("TODO(task4): update stub to RawPage format") brainDir := t.TempDir() for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755)) } - llmResponse := mustJSON([]wiki.Page{ + llmResponse := mustJSON([]RawPage{ { - Path: "wiki/sources/test-article.md", - Content: "---\ntitle: Test Article\ntype: article\ndomain: software-engineering\ndate_ingested: 2026-04-22\nlast_updated: 2026-04-22\naliases:\n - Test Article\n---\n\n## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n## Entities Mentioned\n\n## Open Questions Raised\n", + Title: "Test Article", + Type: "source", + Subtype: "article", + Domain: "software-engineering", + Content: "## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n[[Testing]]\n\n## Entities Mentioned\n\n## Open Questions Raised\n", }, { - Path: "wiki/concepts/testing.md", - Content: "---\ntitle: Testing\ndomain: software-engineering\nlast_updated: 2026-04-22\naliases:\n - Testing\n---\n\n## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n", + Title: "Testing", + Type: "concept", + Domain: "software-engineering", + Content: "## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n", }, }) @@ -54,7 +57,6 @@ func TestRun_WritesPages(t *testing.T) { result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false) require.NoError(t, err) assert.Len(t, result.Pages, 2) - assert.Empty(t, result.Warnings) _, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "test-article.md")) require.NoError(t, err) @@ -67,15 +69,16 @@ func TestRun_WritesPages(t *testing.T) { } func TestRun_DryRunDoesNotWrite(t *testing.T) { - t.Skip("TODO(task4): update stub to RawPage format") brainDir := t.TempDir() for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755)) } - llmResponse := mustJSON([]wiki.Page{{ - Path: "wiki/sources/foo.md", - Content: "---\ntitle: Foo\n---\n\n## Summary\n\nFoo.\n", + llmResponse := mustJSON([]RawPage{{ + Title: "Foo", + Type: "source", + Subtype: "article", + Content: "## Summary\n\nFoo.\n", }}) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -95,16 +98,15 @@ func TestRun_DryRunDoesNotWrite(t *testing.T) { } func TestRun_MergesDuplicatePaths(t *testing.T) { - t.Skip("TODO(task4): update stub to RawPage format") brainDir := t.TempDir() for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755)) } - // LLM returns same path twice (simulates multi-chunk merge) - llmResponse := mustJSON([]wiki.Page{ - {Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFirst.\n\n## Related Concepts\n\n- [[bar|Bar]]\n"}, - {Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nSecond.\n\n## Related Concepts\n\n- [[baz|Baz]]\n"}, + // LLM returns same title twice (simulates multi-chunk duplicate) + llmResponse := mustJSON([]RawPage{ + {Title: "Foo", Type: "concept", Content: "## Definition\n\nFirst.\n\n## Related Concepts\n\n[[Bar]]\n"}, + {Title: "Foo", Type: "concept", Content: "## Definition\n\nSecond.\n\n## Related Concepts\n\n[[Baz]]\n"}, }) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -123,8 +125,9 @@ func TestRun_MergesDuplicatePaths(t *testing.T) { require.NoError(t, err) // keep-first for Definition, union for Related Concepts assert.Contains(t, string(content), "First.") - assert.Contains(t, string(content), "[[bar|Bar]]") - assert.Contains(t, string(content), "[[baz|Baz]]") + // Bar and Baz unknown in empty inventory → left as plain [[links]] + assert.Contains(t, string(content), "[[Bar]]") + assert.Contains(t, string(content), "[[Baz]]") } func mustJSON(v any) string { diff --git a/ingestion/internal/watcher/watcher_test.go b/ingestion/internal/watcher/watcher_test.go index 8fd13a6..2bb3ee7 100644 --- a/ingestion/internal/watcher/watcher_test.go +++ b/ingestion/internal/watcher/watcher_test.go @@ -14,13 +14,12 @@ import ( "github.com/stretchr/testify/require" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" - "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" ) -// successComplete returns a valid JSON-encoded page array for any call. -func successComplete(page wiki.Page) pipeline.CompleteFunc { +// successComplete returns a valid JSON-encoded RawPage array for any call. +func successComplete(raw pipeline.RawPage) pipeline.CompleteFunc { return func(ctx context.Context, system, user string) (string, error) { - b, err := json.Marshal([]wiki.Page{page}) + b, err := json.Marshal([]pipeline.RawPage{raw}) if err != nil { return "", err } @@ -50,16 +49,19 @@ func TestStart_ProcessesFile(t *testing.T) { require.NoError(t, os.WriteFile(rawFile, []byte("Content about Shape Up."), 0o644)) date := time.Now().UTC().Format("2006-01-02") - wikiPage := wiki.Page{ - Path: "wiki/sources/shape-up-book.md", - Content: "---\ntitle: Shape Up Book\ntype: article\ndomain: product-management\ndate_ingested: " + date + "\nlast_updated: " + date + "\naliases:\n - Shape Up Book\n---\n\n## Summary\n\nA book about Shape Up.\n", + rawPage := pipeline.RawPage{ + Title: "Shape Up Book", + Type: "source", + Subtype: "article", + Domain: "product-management", + Content: "## Summary\n\nA book about Shape Up.\n", } cfg := Config{ BrainDir: brainDir, Interval: 50 * time.Millisecond, Pipeline: pipeline.Config{ - Complete: successComplete(wikiPage), + Complete: successComplete(rawPage), ChunkSize: 0, Schema: "# Schema\nThree page types.", }, @@ -193,12 +195,14 @@ func TestProcessDir_SkipsSubdirs(t *testing.T) { // Track which sources were passed to Complete. var processedSources []string completeFn := func(ctx context.Context, system, user string) (string, error) { - // Record that this was called; return a minimal valid page. - page := wiki.Page{ - Path: "wiki/sources/valid.md", - Content: "---\ntitle: Valid\n---\n\n## Summary\n\nValid.\n", + // Record that this was called; return a minimal valid RawPage. + raw := pipeline.RawPage{ + Title: "Valid", + Type: "source", + Subtype: "article", + Content: "## Summary\n\nValid.\n", } - b, _ := json.Marshal([]wiki.Page{page}) + b, _ := json.Marshal([]pipeline.RawPage{raw}) processedSources = append(processedSources, "called") return string(b), nil }