fix(pipeline): skip RawPages with empty title in BuildPages instead of producing broken paths
All checks were successful
CI / Lint / Test / Vet (push) Successful in 9s
CI / Mirror to GitHub (push) Has been skipped

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mathias Bergqvist
2026-04-23 19:55:37 +02:00
parent 537aebc302
commit 923a665365
4 changed files with 46 additions and 16 deletions

View File

@@ -20,9 +20,9 @@ import (
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
) )
// stubComplete is a canned completion function for tests. It ignores every
// argument and always yields the same JSON array holding a single RawPage
// object, so tests never call a real LLM.
func stubComplete(_ context.Context, _, _ string) (string, error) {
	const cannedPages = `[{"title":"Test Source","type":"source","subtype":"article","content":"## Summary\n\nSome content here.\n"}]`
	return cannedPages, nil
}
func stubPipelineCfg() pipeline.Config { func stubPipelineCfg() pipeline.Config {

View File

@@ -10,13 +10,27 @@ import (
// BuildPages converts RawPages from the LLM into wiki.Pages with computed slugs, // BuildPages converts RawPages from the LLM into wiki.Pages with computed slugs,
// paths, and YAML frontmatter. sourceSlug is the slug of the source being ingested // paths, and YAML frontmatter. sourceSlug is the slug of the source being ingested
// (derived from the filename, not the LLM title). // (derived from the filename, not the LLM title). Pages whose title resolves to an
func BuildPages(rawPages []RawPage, sourceSlug, date string) []wiki.Page { // empty slug are skipped and returned as warnings instead.
func BuildPages(rawPages []RawPage, sourceSlug, date string) ([]wiki.Page, []string) {
out := make([]wiki.Page, 0, len(rawPages)) out := make([]wiki.Page, 0, len(rawPages))
var warnings []string
for _, rp := range rawPages { for _, rp := range rawPages {
slug := computeSlug(rp, sourceSlug)
if slug == "" {
warnings = append(warnings, fmt.Sprintf("skipped page with empty title (type: %s)", rp.Type))
continue
}
out = append(out, buildPage(rp, sourceSlug, date)) out = append(out, buildPage(rp, sourceSlug, date))
} }
return out return out, warnings
}
func computeSlug(rp RawPage, sourceSlug string) string {
if rp.Type == "source" {
return sourceSlug
}
return wiki.Slug(rp.Title)
} }
func buildPage(rp RawPage, sourceSlug, date string) wiki.Page { func buildPage(rp RawPage, sourceSlug, date string) wiki.Page {

View File

@@ -19,8 +19,9 @@ func TestBuildPages_SourcePage(t *testing.T) {
Content: "## Summary\n\nA book about shaping product work.\n", Content: "## Summary\n\nA book about shaping product work.\n",
}, },
} }
pages := BuildPages(raw, "shape-up", "2026-04-23") pages, warnings := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
assert.Empty(t, warnings)
p := pages[0] p := pages[0]
assert.Equal(t, "wiki/sources/shape-up.md", p.Path) assert.Equal(t, "wiki/sources/shape-up.md", p.Path)
@@ -43,8 +44,9 @@ func TestBuildPages_ConceptPage(t *testing.T) {
Content: "## Definition\n\nA resource allocation technique.\n", Content: "## Definition\n\nA resource allocation technique.\n",
}, },
} }
pages := BuildPages(raw, "shape-up", "2026-04-23") pages, warnings := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
assert.Empty(t, warnings)
p := pages[0] p := pages[0]
assert.Equal(t, "wiki/concepts/betting.md", p.Path) assert.Equal(t, "wiki/concepts/betting.md", p.Path)
@@ -66,8 +68,9 @@ func TestBuildPages_EntityPage(t *testing.T) {
Content: "## Description\n\nA product designer.\n", Content: "## Description\n\nA product designer.\n",
}, },
} }
pages := BuildPages(raw, "shape-up", "2026-04-23") pages, warnings := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
assert.Empty(t, warnings)
p := pages[0] p := pages[0]
assert.Equal(t, "wiki/entities/ryan-singer.md", p.Path) assert.Equal(t, "wiki/entities/ryan-singer.md", p.Path)
@@ -84,7 +87,7 @@ func TestBuildPages_SourceSlugUsedForSourcePage(t *testing.T) {
raw := []RawPage{ raw := []RawPage{
{Title: "FinBERT: A Pretrained Model", Type: "source", Subtype: "article", Content: "## Summary\n\nA model.\n"}, {Title: "FinBERT: A Pretrained Model", Type: "source", Subtype: "article", Content: "## Summary\n\nA model.\n"},
} }
pages := BuildPages(raw, "finbert-huggingface", "2026-04-23") pages, _ := BuildPages(raw, "finbert-huggingface", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
assert.Equal(t, "wiki/sources/finbert-huggingface.md", pages[0].Path) assert.Equal(t, "wiki/sources/finbert-huggingface.md", pages[0].Path)
} }
@@ -93,7 +96,7 @@ func TestBuildPages_ConceptSlugDerivedFromTitle(t *testing.T) {
raw := []RawPage{ raw := []RawPage{
{Title: "Domain-Driven Design", Type: "concept", Content: "## Definition\n\nFoo.\n"}, {Title: "Domain-Driven Design", Type: "concept", Content: "## Definition\n\nFoo.\n"},
} }
pages := BuildPages(raw, "some-source", "2026-04-23") pages, _ := BuildPages(raw, "some-source", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
assert.Equal(t, "wiki/concepts/domain-driven-design.md", pages[0].Path) assert.Equal(t, "wiki/concepts/domain-driven-design.md", pages[0].Path)
} }
@@ -103,7 +106,7 @@ func TestBuildPages_SourceDefaultSubtype(t *testing.T) {
raw := []RawPage{ raw := []RawPage{
{Title: "Some Post", Type: "source", Content: "## Summary\n\nA post.\n"}, {Title: "Some Post", Type: "source", Content: "## Summary\n\nA post.\n"},
} }
pages := BuildPages(raw, "some-post", "2026-04-23") pages, _ := BuildPages(raw, "some-post", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
assert.Contains(t, pages[0].Content, "type: 'article'") assert.Contains(t, pages[0].Content, "type: 'article'")
} }
@@ -112,7 +115,7 @@ func TestBuildPages_OmitsDomainWhenEmpty(t *testing.T) {
raw := []RawPage{ raw := []RawPage{
{Title: "Betting", Type: "concept", Content: "## Definition\n\nFoo.\n"}, {Title: "Betting", Type: "concept", Content: "## Definition\n\nFoo.\n"},
} }
pages := BuildPages(raw, "src", "2026-04-23") pages, _ := BuildPages(raw, "src", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
assert.NotContains(t, pages[0].Content, "domain:") assert.NotContains(t, pages[0].Content, "domain:")
} }
@@ -123,7 +126,7 @@ func TestBuildPages_MultiplePages(t *testing.T) {
{Title: "Betting", Type: "concept", Content: "## Definition\n\nA technique.\n"}, {Title: "Betting", Type: "concept", Content: "## Definition\n\nA technique.\n"},
{Title: "Ryan Singer", Type: "entity", Subtype: "person", Content: "## Description\n\nA designer.\n"}, {Title: "Ryan Singer", Type: "entity", Subtype: "person", Content: "## Description\n\nA designer.\n"},
} }
pages := BuildPages(raw, "shape-up", "2026-04-23") pages, _ := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 3) require.Len(t, pages, 3)
assert.Equal(t, "wiki/sources/shape-up.md", pages[0].Path) assert.Equal(t, "wiki/sources/shape-up.md", pages[0].Path)
assert.Equal(t, "wiki/concepts/betting.md", pages[1].Path) assert.Equal(t, "wiki/concepts/betting.md", pages[1].Path)
@@ -134,7 +137,7 @@ func TestBuildPages_TitleWithColon(t *testing.T) {
raw := []RawPage{ raw := []RawPage{
{Title: "Shape Up: The Basecamp Method", Type: "source", Subtype: "book", Content: "## Summary\n\nA book.\n"}, {Title: "Shape Up: The Basecamp Method", Type: "source", Subtype: "book", Content: "## Summary\n\nA book.\n"},
} }
pages := BuildPages(raw, "shape-up", "2026-04-23") pages, _ := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
// Title with colon must be quoted in YAML // Title with colon must be quoted in YAML
assert.Contains(t, pages[0].Content, "title: 'Shape Up: The Basecamp Method'") assert.Contains(t, pages[0].Content, "title: 'Shape Up: The Basecamp Method'")
@@ -145,8 +148,20 @@ func TestBuildPages_EntityNoSubtype(t *testing.T) {
raw := []RawPage{ raw := []RawPage{
{Title: "Basecamp", Type: "entity", Content: "## Description\n\nA company.\n"}, {Title: "Basecamp", Type: "entity", Content: "## Description\n\nA company.\n"},
} }
pages := BuildPages(raw, "src", "2026-04-23") pages, _ := BuildPages(raw, "src", "2026-04-23")
require.Len(t, pages, 1) require.Len(t, pages, 1)
assert.NotContains(t, pages[0].Content, "type:") assert.NotContains(t, pages[0].Content, "type:")
assert.Contains(t, pages[0].Content, "title: 'Basecamp'") assert.Contains(t, pages[0].Content, "title: 'Basecamp'")
} }
// TestBuildPages_EmptyTitleSkippedWithWarning feeds BuildPages one page with an
// empty title followed by one valid concept page. It expects only the valid
// page to be built and exactly one warning mentioning the empty title.
func TestBuildPages_EmptyTitleSkippedWithWarning(t *testing.T) {
	raw := []RawPage{
		{Title: "", Type: "concept", Content: "## Definition\n\nFoo.\n"},
		{Title: "Betting", Type: "concept", Content: "## Definition\n\nA technique.\n"},
	}
	pages, warnings := BuildPages(raw, "src", "2026-04-23")
	require.Len(t, pages, 1, "empty-title page should be skipped")
	assert.Equal(t, "wiki/concepts/betting.md", pages[0].Path)
	// require (not assert) so a missing warning fails the test here instead of
	// panicking on the warnings[0] index below.
	require.Len(t, warnings, 1)
	assert.Contains(t, warnings[0], "empty title")
}

View File

@@ -59,7 +59,8 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
allWarnings = append(allWarnings, warnings...) allWarnings = append(allWarnings, warnings...)
} }
pages := BuildPages(allRaw, sourceSlug, date) pages, buildWarnings := BuildPages(allRaw, sourceSlug, date)
allWarnings = append(allWarnings, buildWarnings...)
resolved := Resolve(pages, inventory) resolved := Resolve(pages, inventory)
canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory) canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory)
allWarnings = append(allWarnings, linkWarnings...) allWarnings = append(allWarnings, linkWarnings...)