feat(pipeline): add BuildPages — compute slugs/paths/frontmatter from RawPage

This commit is contained in:
Mathias Bergqvist
2026-04-23 18:50:37 +02:00
parent a620f6cb01
commit 7b57051af8
2 changed files with 219 additions and 0 deletions

View File

@@ -0,0 +1,88 @@
// ingestion/internal/pipeline/build.go
package pipeline
import (
"fmt"
"strings"
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
)
// BuildPages converts RawPages from the LLM into wiki.Pages with computed slugs,
// paths, and YAML frontmatter. sourceSlug is the slug of the source being ingested
// (derived from the filename, not the LLM title).
func BuildPages(rawPages []RawPage, sourceSlug, date string) []wiki.Page {
out := make([]wiki.Page, 0, len(rawPages))
for _, rp := range rawPages {
out = append(out, buildPage(rp, sourceSlug, date))
}
return out
}
func buildPage(rp RawPage, sourceSlug, date string) wiki.Page {
var slug, dir string
switch rp.Type {
case "source":
slug = sourceSlug
dir = "wiki/sources"
case "concept":
slug = wiki.Slug(rp.Title)
dir = "wiki/concepts"
case "entity":
slug = wiki.Slug(rp.Title)
dir = "wiki/entities"
default:
slug = wiki.Slug(rp.Title)
dir = "wiki/" + rp.Type
}
path := dir + "/" + slug + ".md"
fm := buildFrontmatter(rp, date)
return wiki.Page{
Path: path,
Content: fm + "\n" + rp.Content,
}
}
func buildFrontmatter(rp RawPage, date string) string {
var sb strings.Builder
sb.WriteString("---\n")
fmt.Fprintf(&sb, "title: %s\n", rp.Title)
switch rp.Type {
case "source":
subtype := rp.Subtype
if subtype == "" {
subtype = "article"
}
fmt.Fprintf(&sb, "type: %s\n", subtype)
if rp.Domain != "" {
fmt.Fprintf(&sb, "domain: %s\n", rp.Domain)
}
fmt.Fprintf(&sb, "date_ingested: %s\n", date)
fmt.Fprintf(&sb, "last_updated: %s\n", date)
case "concept":
if rp.Domain != "" {
fmt.Fprintf(&sb, "domain: %s\n", rp.Domain)
}
fmt.Fprintf(&sb, "last_updated: %s\n", date)
case "entity":
if rp.Subtype != "" {
fmt.Fprintf(&sb, "type: %s\n", rp.Subtype)
}
if rp.Domain != "" {
fmt.Fprintf(&sb, "domain: %s\n", rp.Domain)
}
fmt.Fprintf(&sb, "last_updated: %s\n", date)
default:
if rp.Domain != "" {
fmt.Fprintf(&sb, "domain: %s\n", rp.Domain)
}
fmt.Fprintf(&sb, "last_updated: %s\n", date)
}
fmt.Fprintf(&sb, "aliases:\n - %s\n", rp.Title)
sb.WriteString("---\n")
return sb.String()
}

View File

@@ -0,0 +1,131 @@
// ingestion/internal/pipeline/build_test.go
package pipeline
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestBuildPages_SourcePage(t *testing.T) {
raw := []RawPage{
{
Title: "Shape Up",
Type: "source",
Subtype: "book",
Domain: "product-strategy",
Content: "## Summary\n\nA book about shaping product work.\n",
},
}
pages := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 1)
p := pages[0]
assert.Equal(t, "wiki/sources/shape-up.md", p.Path)
assert.Contains(t, p.Content, "title: Shape Up")
assert.Contains(t, p.Content, "type: book")
assert.Contains(t, p.Content, "domain: product-strategy")
assert.Contains(t, p.Content, "date_ingested: 2026-04-23")
assert.Contains(t, p.Content, "last_updated: 2026-04-23")
assert.Contains(t, p.Content, "aliases:\n - Shape Up")
assert.Contains(t, p.Content, "## Summary")
assert.True(t, strings.HasPrefix(p.Content, "---\n"), "content must start with frontmatter")
}
func TestBuildPages_ConceptPage(t *testing.T) {
raw := []RawPage{
{
Title: "Betting",
Type: "concept",
Domain: "product-strategy",
Content: "## Definition\n\nA resource allocation technique.\n",
},
}
pages := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 1)
p := pages[0]
assert.Equal(t, "wiki/concepts/betting.md", p.Path)
assert.Contains(t, p.Content, "title: Betting")
assert.Contains(t, p.Content, "domain: product-strategy")
assert.Contains(t, p.Content, "last_updated: 2026-04-23")
assert.Contains(t, p.Content, "aliases:\n - Betting")
assert.NotContains(t, p.Content, "date_ingested")
assert.Contains(t, p.Content, "## Definition")
}
func TestBuildPages_EntityPage(t *testing.T) {
raw := []RawPage{
{
Title: "Ryan Singer",
Type: "entity",
Subtype: "person",
Domain: "product-strategy",
Content: "## Description\n\nA product designer.\n",
},
}
pages := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 1)
p := pages[0]
assert.Equal(t, "wiki/entities/ryan-singer.md", p.Path)
assert.Contains(t, p.Content, "title: Ryan Singer")
assert.Contains(t, p.Content, "type: person")
assert.Contains(t, p.Content, "domain: product-strategy")
assert.Contains(t, p.Content, "last_updated: 2026-04-23")
assert.Contains(t, p.Content, "aliases:\n - Ryan Singer")
assert.NotContains(t, p.Content, "date_ingested")
}
func TestBuildPages_SourceSlugUsedForSourcePage(t *testing.T) {
// LLM title differs from filename — pipeline uses sourceSlug for the source page path.
raw := []RawPage{
{Title: "FinBERT: A Pretrained Model", Type: "source", Subtype: "article", Content: "## Summary\n\nA model.\n"},
}
pages := BuildPages(raw, "finbert-huggingface", "2026-04-23")
require.Len(t, pages, 1)
assert.Equal(t, "wiki/sources/finbert-huggingface.md", pages[0].Path)
}
func TestBuildPages_ConceptSlugDerivedFromTitle(t *testing.T) {
raw := []RawPage{
{Title: "Domain-Driven Design", Type: "concept", Content: "## Definition\n\nFoo.\n"},
}
pages := BuildPages(raw, "some-source", "2026-04-23")
require.Len(t, pages, 1)
assert.Equal(t, "wiki/concepts/domain-driven-design.md", pages[0].Path)
}
func TestBuildPages_SourceDefaultSubtype(t *testing.T) {
// If subtype is omitted for a source, default to "article"
raw := []RawPage{
{Title: "Some Post", Type: "source", Content: "## Summary\n\nA post.\n"},
}
pages := BuildPages(raw, "some-post", "2026-04-23")
require.Len(t, pages, 1)
assert.Contains(t, pages[0].Content, "type: article")
}
func TestBuildPages_OmitsDomainWhenEmpty(t *testing.T) {
raw := []RawPage{
{Title: "Betting", Type: "concept", Content: "## Definition\n\nFoo.\n"},
}
pages := BuildPages(raw, "src", "2026-04-23")
require.Len(t, pages, 1)
assert.NotContains(t, pages[0].Content, "domain:")
}
func TestBuildPages_MultiplePages(t *testing.T) {
raw := []RawPage{
{Title: "Shape Up", Type: "source", Subtype: "book", Content: "## Summary\n\nA book.\n"},
{Title: "Betting", Type: "concept", Content: "## Definition\n\nA technique.\n"},
{Title: "Ryan Singer", Type: "entity", Subtype: "person", Content: "## Description\n\nA designer.\n"},
}
pages := BuildPages(raw, "shape-up", "2026-04-23")
require.Len(t, pages, 3)
assert.Equal(t, "wiki/sources/shape-up.md", pages[0].Path)
assert.Equal(t, "wiki/concepts/betting.md", pages[1].Path)
assert.Equal(t, "wiki/entities/ryan-singer.md", pages[2].Path)
}