feat(pipeline): add CanonicalizeLinks — convert [[Display Name]] to [[slug|Display Name]]
This commit is contained in:
70
ingestion/internal/pipeline/links.go
Normal file
70
ingestion/internal/pipeline/links.go
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
// ingestion/internal/pipeline/links.go
|
||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||||
|
)
|
||||||
|
|
||||||
|
// plainLinkRE recognises "plain" wikilinks of the form [[Display Name]], i.e.
// double-bracketed text that contains neither a closing bracket nor a pipe.
// Capture group 1 holds the text between the brackets.
//
// Links that already carry a slug, such as [[slug|Display]], never match
// because the pipe character is excluded from the name. The negated class
// does match newlines, so a link name may span lines.
var plainLinkRE = regexp.MustCompile(`\[\[([^\]|]+)\]\]`)
|
||||||
|
|
||||||
|
// CanonicalizeLinks converts [[Display Name]] wikilinks to [[slug|Display Name]]
|
||||||
|
// using a title→slug map built from the inventory and current batch.
|
||||||
|
// Unknown titles are left as-is and returned as warnings.
|
||||||
|
func CanonicalizeLinks(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) ([]wiki.Page, []string) {
|
||||||
|
titleToSlug := buildTitleMap(pages, inventory)
|
||||||
|
|
||||||
|
var allWarnings []string
|
||||||
|
out := make([]wiki.Page, len(pages))
|
||||||
|
for i, p := range pages {
|
||||||
|
newContent, warnings := canonicalizeContent(p.Content, titleToSlug)
|
||||||
|
p.Content = newContent
|
||||||
|
out[i] = p
|
||||||
|
allWarnings = append(allWarnings, warnings...)
|
||||||
|
}
|
||||||
|
return out, allWarnings
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildTitleMap builds a lowercase-title → slug map from inventory and current batch.
|
||||||
|
// Current batch entries take precedence over inventory (they may be updates).
|
||||||
|
func buildTitleMap(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) map[string]string {
|
||||||
|
m := make(map[string]string)
|
||||||
|
for _, entries := range inventory {
|
||||||
|
for _, e := range entries {
|
||||||
|
m[strings.ToLower(e.Title)] = e.Slug
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Current batch overrides inventory
|
||||||
|
for _, p := range pages {
|
||||||
|
title := extractTitle(p.Content)
|
||||||
|
slug := strings.TrimSuffix(filepath.Base(p.Path), ".md")
|
||||||
|
if title != "" && slug != "" {
|
||||||
|
m[strings.ToLower(title)] = slug
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
func canonicalizeContent(content string, titleToSlug map[string]string) (string, []string) {
|
||||||
|
var warnings []string
|
||||||
|
result := plainLinkRE.ReplaceAllStringFunc(content, func(match string) string {
|
||||||
|
sub := plainLinkRE.FindStringSubmatch(match)
|
||||||
|
if len(sub) < 2 {
|
||||||
|
return match
|
||||||
|
}
|
||||||
|
displayName := sub[1]
|
||||||
|
slug, ok := titleToSlug[strings.ToLower(displayName)]
|
||||||
|
if !ok {
|
||||||
|
warnings = append(warnings, fmt.Sprintf("unknown wikilink: [[%s]]", displayName))
|
||||||
|
return match
|
||||||
|
}
|
||||||
|
return "[[" + slug + "|" + displayName + "]]"
|
||||||
|
})
|
||||||
|
return result, warnings
|
||||||
|
}
|
||||||
125
ingestion/internal/pipeline/links_test.go
Normal file
125
ingestion/internal/pipeline/links_test.go
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
// ingestion/internal/pipeline/links_test.go
|
||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestCanonicalizeLinks_KnownTitle(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/shape-up.md",
|
||||||
|
Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Betting]].\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeConcept: {
|
||||||
|
{Slug: "betting", Title: "Betting"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
got, warnings := CanonicalizeLinks(pages, inventory)
|
||||||
|
require.Len(t, got, 1)
|
||||||
|
assert.Empty(t, warnings)
|
||||||
|
assert.Contains(t, got[0].Content, "[[betting|Betting]]")
|
||||||
|
assert.NotContains(t, got[0].Content, "[[Betting]]")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCanonicalizeLinks_UnknownTitleLeftAsIs(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/shape-up.md",
|
||||||
|
Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Ghost Concept]].\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{}
|
||||||
|
got, warnings := CanonicalizeLinks(pages, inventory)
|
||||||
|
require.Len(t, got, 1)
|
||||||
|
assert.NotEmpty(t, warnings)
|
||||||
|
assert.Contains(t, got[0].Content, "[[Ghost Concept]]")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCanonicalizeLinks_AlreadyCanonicalLinkUntouched(t *testing.T) {
|
||||||
|
// Links already in [[slug|Display]] format must not be double-converted
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/shape-up.md",
|
||||||
|
Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[betting|Betting]].\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeConcept: {
|
||||||
|
{Slug: "betting", Title: "Betting"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
got, warnings := CanonicalizeLinks(pages, inventory)
|
||||||
|
require.Len(t, got, 1)
|
||||||
|
assert.Empty(t, warnings)
|
||||||
|
// Should remain exactly as-is — not double-wrapped
|
||||||
|
assert.Contains(t, got[0].Content, "[[betting|Betting]]")
|
||||||
|
assert.NotContains(t, got[0].Content, "[[betting|[[betting|Betting]]]]")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCanonicalizeLinks_CaseInsensitiveMatch(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/foo.md",
|
||||||
|
Content: "---\ntitle: 'Foo'\n---\n\n## Summary\n\nSee [[domain driven design]].\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeConcept: {
|
||||||
|
{Slug: "domain-driven-design", Title: "Domain Driven Design"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
got, warnings := CanonicalizeLinks(pages, inventory)
|
||||||
|
require.Len(t, got, 1)
|
||||||
|
assert.Empty(t, warnings)
|
||||||
|
assert.Contains(t, got[0].Content, "[[domain-driven-design|domain driven design]]")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCanonicalizeLinks_CurrentBatchPagesResolved(t *testing.T) {
|
||||||
|
// A concept created in the same batch should be canonicalizable
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/shape-up.md",
|
||||||
|
Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Betting]].\n",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Path: "wiki/concepts/betting.md",
|
||||||
|
Content: "---\ntitle: 'Betting'\n---\n\n## Definition\n\nA technique.\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{} // empty — Betting is in the batch, not inventory
|
||||||
|
|
||||||
|
got, warnings := CanonicalizeLinks(pages, inventory)
|
||||||
|
require.Len(t, got, 2)
|
||||||
|
assert.Empty(t, warnings)
|
||||||
|
assert.Contains(t, got[0].Content, "[[betting|Betting]]")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCanonicalizeLinks_MultipleLinksInOnePage(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/foo.md",
|
||||||
|
Content: "---\ntitle: 'Foo'\n---\n\n## Summary\n\nSee [[Betting]] and [[Shape Up]].\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeConcept: {
|
||||||
|
{Slug: "betting", Title: "Betting"},
|
||||||
|
},
|
||||||
|
wiki.PageTypeSource: {
|
||||||
|
{Slug: "shape-up", Title: "Shape Up"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
got, warnings := CanonicalizeLinks(pages, inventory)
|
||||||
|
require.Len(t, got, 1)
|
||||||
|
assert.Empty(t, warnings)
|
||||||
|
assert.Contains(t, got[0].Content, "[[betting|Betting]]")
|
||||||
|
assert.Contains(t, got[0].Content, "[[shape-up|Shape Up]]")
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user