feat(pipeline): add CanonicalizeLinks — convert [[Display Name]] to [[slug|Display Name]]

This commit is contained in:
Mathias Bergqvist
2026-04-23 18:59:10 +02:00
parent a7b363d589
commit 26855f69b0
2 changed files with 195 additions and 0 deletions

View File

@@ -0,0 +1,70 @@
// ingestion/internal/pipeline/links.go
package pipeline
import (
"fmt"
"path/filepath"
"regexp"
"strings"
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
)
// plainLinkRE matches [[Display Name]] — wikilinks without a slug pipe.
// It does NOT match [[slug|Display]] (those already have a pipe).
var plainLinkRE = regexp.MustCompile(`\[\[([^\]|]+)\]\]`)
// CanonicalizeLinks converts [[Display Name]] wikilinks to [[slug|Display Name]]
// using a title→slug map built from the inventory and current batch.
// Unknown titles are left as-is and returned as warnings.
func CanonicalizeLinks(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) ([]wiki.Page, []string) {
titleToSlug := buildTitleMap(pages, inventory)
var allWarnings []string
out := make([]wiki.Page, len(pages))
for i, p := range pages {
newContent, warnings := canonicalizeContent(p.Content, titleToSlug)
p.Content = newContent
out[i] = p
allWarnings = append(allWarnings, warnings...)
}
return out, allWarnings
}
// buildTitleMap builds a lowercase-title → slug map from inventory and current batch.
// Current batch entries take precedence over inventory (they may be updates).
func buildTitleMap(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) map[string]string {
m := make(map[string]string)
for _, entries := range inventory {
for _, e := range entries {
m[strings.ToLower(e.Title)] = e.Slug
}
}
// Current batch overrides inventory
for _, p := range pages {
title := extractTitle(p.Content)
slug := strings.TrimSuffix(filepath.Base(p.Path), ".md")
if title != "" && slug != "" {
m[strings.ToLower(title)] = slug
}
}
return m
}
func canonicalizeContent(content string, titleToSlug map[string]string) (string, []string) {
var warnings []string
result := plainLinkRE.ReplaceAllStringFunc(content, func(match string) string {
sub := plainLinkRE.FindStringSubmatch(match)
if len(sub) < 2 {
return match
}
displayName := sub[1]
slug, ok := titleToSlug[strings.ToLower(displayName)]
if !ok {
warnings = append(warnings, fmt.Sprintf("unknown wikilink: [[%s]]", displayName))
return match
}
return "[[" + slug + "|" + displayName + "]]"
})
return result, warnings
}

View File

@@ -0,0 +1,125 @@
// ingestion/internal/pipeline/links_test.go
package pipeline
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
)
func TestCanonicalizeLinks_KnownTitle(t *testing.T) {
pages := []wiki.Page{
{
Path: "wiki/sources/shape-up.md",
Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Betting]].\n",
},
}
inventory := map[wiki.PageType][]wiki.Entry{
wiki.PageTypeConcept: {
{Slug: "betting", Title: "Betting"},
},
}
got, warnings := CanonicalizeLinks(pages, inventory)
require.Len(t, got, 1)
assert.Empty(t, warnings)
assert.Contains(t, got[0].Content, "[[betting|Betting]]")
assert.NotContains(t, got[0].Content, "[[Betting]]")
}
func TestCanonicalizeLinks_UnknownTitleLeftAsIs(t *testing.T) {
pages := []wiki.Page{
{
Path: "wiki/sources/shape-up.md",
Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Ghost Concept]].\n",
},
}
inventory := map[wiki.PageType][]wiki.Entry{}
got, warnings := CanonicalizeLinks(pages, inventory)
require.Len(t, got, 1)
assert.NotEmpty(t, warnings)
assert.Contains(t, got[0].Content, "[[Ghost Concept]]")
}
func TestCanonicalizeLinks_AlreadyCanonicalLinkUntouched(t *testing.T) {
// Links already in [[slug|Display]] format must not be double-converted
pages := []wiki.Page{
{
Path: "wiki/sources/shape-up.md",
Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[betting|Betting]].\n",
},
}
inventory := map[wiki.PageType][]wiki.Entry{
wiki.PageTypeConcept: {
{Slug: "betting", Title: "Betting"},
},
}
got, warnings := CanonicalizeLinks(pages, inventory)
require.Len(t, got, 1)
assert.Empty(t, warnings)
// Should remain exactly as-is — not double-wrapped
assert.Contains(t, got[0].Content, "[[betting|Betting]]")
assert.NotContains(t, got[0].Content, "[[betting|[[betting|Betting]]]]")
}
func TestCanonicalizeLinks_CaseInsensitiveMatch(t *testing.T) {
pages := []wiki.Page{
{
Path: "wiki/sources/foo.md",
Content: "---\ntitle: 'Foo'\n---\n\n## Summary\n\nSee [[domain driven design]].\n",
},
}
inventory := map[wiki.PageType][]wiki.Entry{
wiki.PageTypeConcept: {
{Slug: "domain-driven-design", Title: "Domain Driven Design"},
},
}
got, warnings := CanonicalizeLinks(pages, inventory)
require.Len(t, got, 1)
assert.Empty(t, warnings)
assert.Contains(t, got[0].Content, "[[domain-driven-design|domain driven design]]")
}
func TestCanonicalizeLinks_CurrentBatchPagesResolved(t *testing.T) {
// A concept created in the same batch should be canonicalizable
pages := []wiki.Page{
{
Path: "wiki/sources/shape-up.md",
Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Betting]].\n",
},
{
Path: "wiki/concepts/betting.md",
Content: "---\ntitle: 'Betting'\n---\n\n## Definition\n\nA technique.\n",
},
}
inventory := map[wiki.PageType][]wiki.Entry{} // empty — Betting is in the batch, not inventory
got, warnings := CanonicalizeLinks(pages, inventory)
require.Len(t, got, 2)
assert.Empty(t, warnings)
assert.Contains(t, got[0].Content, "[[betting|Betting]]")
}
func TestCanonicalizeLinks_MultipleLinksInOnePage(t *testing.T) {
pages := []wiki.Page{
{
Path: "wiki/sources/foo.md",
Content: "---\ntitle: 'Foo'\n---\n\n## Summary\n\nSee [[Betting]] and [[Shape Up]].\n",
},
}
inventory := map[wiki.PageType][]wiki.Entry{
wiki.PageTypeConcept: {
{Slug: "betting", Title: "Betting"},
},
wiki.PageTypeSource: {
{Slug: "shape-up", Title: "Shape Up"},
},
}
got, warnings := CanonicalizeLinks(pages, inventory)
require.Len(t, got, 1)
assert.Empty(t, warnings)
assert.Contains(t, got[0].Content, "[[betting|Betting]]")
assert.Contains(t, got[0].Content, "[[shape-up|Shape Up]]")
}