From a7b363d589632e5bcd0a38b87284c59eb94eb4dd Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 18:56:39 +0200 Subject: [PATCH] fix(pipeline): quote YAML scalar fields in buildFrontmatter to prevent injection --- ingestion/internal/pipeline/build.go | 20 ++++++---- ingestion/internal/pipeline/build_test.go | 45 +++++++++++++++++------ 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/ingestion/internal/pipeline/build.go b/ingestion/internal/pipeline/build.go index aad4a2d..eb324af 100644 --- a/ingestion/internal/pipeline/build.go +++ b/ingestion/internal/pipeline/build.go @@ -48,7 +48,7 @@ func buildPage(rp RawPage, sourceSlug, date string) wiki.Page { func buildFrontmatter(rp RawPage, date string) string { var sb strings.Builder sb.WriteString("---\n") - fmt.Fprintf(&sb, "title: %s\n", rp.Title) + fmt.Fprintf(&sb, "title: %s\n", yamlScalar(rp.Title)) switch rp.Type { case "source": @@ -56,33 +56,37 @@ func buildFrontmatter(rp RawPage, date string) string { if subtype == "" { subtype = "article" } - fmt.Fprintf(&sb, "type: %s\n", subtype) + fmt.Fprintf(&sb, "type: %s\n", yamlScalar(subtype)) if rp.Domain != "" { - fmt.Fprintf(&sb, "domain: %s\n", rp.Domain) + fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain)) } fmt.Fprintf(&sb, "date_ingested: %s\n", date) fmt.Fprintf(&sb, "last_updated: %s\n", date) case "concept": if rp.Domain != "" { - fmt.Fprintf(&sb, "domain: %s\n", rp.Domain) + fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain)) } fmt.Fprintf(&sb, "last_updated: %s\n", date) case "entity": if rp.Subtype != "" { - fmt.Fprintf(&sb, "type: %s\n", rp.Subtype) + fmt.Fprintf(&sb, "type: %s\n", yamlScalar(rp.Subtype)) } if rp.Domain != "" { - fmt.Fprintf(&sb, "domain: %s\n", rp.Domain) + fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain)) } fmt.Fprintf(&sb, "last_updated: %s\n", date) default: if rp.Domain != "" { - fmt.Fprintf(&sb, "domain: %s\n", rp.Domain) + fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain)) } fmt.Fprintf(&sb, "last_updated: %s\n", date) } - fmt.Fprintf(&sb, "aliases:\n - %s\n", rp.Title) + fmt.Fprintf(&sb, "aliases:\n - %s\n", yamlScalar(rp.Title)) sb.WriteString("---\n") return sb.String() } + +func yamlScalar(s string) string { + return "'" + strings.ReplaceAll(s, "'", "''") + "'" +} diff --git a/ingestion/internal/pipeline/build_test.go b/ingestion/internal/pipeline/build_test.go index 8f127cb..19c2f3d 100644 --- a/ingestion/internal/pipeline/build_test.go +++ b/ingestion/internal/pipeline/build_test.go @@ -24,12 +24,12 @@ func TestBuildPages_SourcePage(t *testing.T) { p := pages[0] assert.Equal(t, "wiki/sources/shape-up.md", p.Path) - assert.Contains(t, p.Content, "title: Shape Up") - assert.Contains(t, p.Content, "type: book") - assert.Contains(t, p.Content, "domain: product-strategy") + assert.Contains(t, p.Content, "title: 'Shape Up'") + assert.Contains(t, p.Content, "type: 'book'") + assert.Contains(t, p.Content, "domain: 'product-strategy'") assert.Contains(t, p.Content, "date_ingested: 2026-04-23") assert.Contains(t, p.Content, "last_updated: 2026-04-23") - assert.Contains(t, p.Content, "aliases:\n - Shape Up") + assert.Contains(t, p.Content, "aliases:\n - 'Shape Up'") assert.Contains(t, p.Content, "## Summary") assert.True(t, strings.HasPrefix(p.Content, "---\n"), "content must start with frontmatter") } @@ -48,10 +48,10 @@ func TestBuildPages_ConceptPage(t *testing.T) { p := pages[0] assert.Equal(t, "wiki/concepts/betting.md", p.Path) - assert.Contains(t, p.Content, "title: Betting") - assert.Contains(t, p.Content, "domain: product-strategy") + assert.Contains(t, p.Content, "title: 'Betting'") + assert.Contains(t, p.Content, "domain: 'product-strategy'") assert.Contains(t, p.Content, "last_updated: 2026-04-23") - assert.Contains(t, p.Content, "aliases:\n - Betting") + assert.Contains(t, p.Content, "aliases:\n - 'Betting'") assert.NotContains(t, p.Content, "date_ingested") assert.Contains(t, p.Content, "## Definition") } @@ -71,11 +71,11 @@ func TestBuildPages_EntityPage(t *testing.T) { p := pages[0] assert.Equal(t, "wiki/entities/ryan-singer.md", p.Path) - assert.Contains(t, p.Content, "title: Ryan Singer") - assert.Contains(t, p.Content, "type: person") - assert.Contains(t, p.Content, "domain: product-strategy") + assert.Contains(t, p.Content, "title: 'Ryan Singer'") + assert.Contains(t, p.Content, "type: 'person'") + assert.Contains(t, p.Content, "domain: 'product-strategy'") assert.Contains(t, p.Content, "last_updated: 2026-04-23") - assert.Contains(t, p.Content, "aliases:\n - Ryan Singer") + assert.Contains(t, p.Content, "aliases:\n - 'Ryan Singer'") assert.NotContains(t, p.Content, "date_ingested") } @@ -105,7 +105,7 @@ func TestBuildPages_SourceDefaultSubtype(t *testing.T) { } pages := BuildPages(raw, "some-post", "2026-04-23") require.Len(t, pages, 1) - assert.Contains(t, pages[0].Content, "type: article") + assert.Contains(t, pages[0].Content, "type: 'article'") } func TestBuildPages_OmitsDomainWhenEmpty(t *testing.T) { @@ -129,3 +129,24 @@ func TestBuildPages_MultiplePages(t *testing.T) { assert.Equal(t, "wiki/concepts/betting.md", pages[1].Path) assert.Equal(t, "wiki/entities/ryan-singer.md", pages[2].Path) } + +func TestBuildPages_TitleWithColon(t *testing.T) { + raw := []RawPage{ + {Title: "Shape Up: The Basecamp Method", Type: "source", Subtype: "book", Content: "## Summary\n\nA book.\n"}, + } + pages := BuildPages(raw, "shape-up", "2026-04-23") + require.Len(t, pages, 1) + // Title with colon must be quoted in YAML + assert.Contains(t, pages[0].Content, "title: 'Shape Up: The Basecamp Method'") + assert.Contains(t, pages[0].Content, "aliases:\n - 'Shape Up: The Basecamp Method'") +} + +func TestBuildPages_EntityNoSubtype(t *testing.T) { + raw := []RawPage{ + {Title: "Basecamp", Type: "entity", Content: "## Description\n\nA company.\n"}, + } + pages := BuildPages(raw, "src", "2026-04-23") + require.Len(t, pages, 1) + assert.NotContains(t, pages[0].Content, "type:") + assert.Contains(t, pages[0].Content, "title: 'Basecamp'") +}