feat(pipeline): inject source back-references into concept and entity pages
This commit is contained in:
115
ingestion/internal/pipeline/refs.go
Normal file
115
ingestion/internal/pipeline/refs.go
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
// ingestion/internal/pipeline/refs.go
|
||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||||
|
)
|
||||||
|
|
||||||
|
// wikilinkRE matches the opening of an aliased wikilink, "[[slug|", and
// captures the slug: a non-empty run of characters other than '|' and ']'.
// Links written without an alias ("[[slug]]") do not match.
var wikilinkRE = regexp.MustCompile(`\[\[([^|\]]+)\|`)
|
||||||
|
|
||||||
|
// injectSourceRefs finds the source page in the proposed batch, extracts its
|
||||||
|
// wikilinks, and injects a back-reference into every linked concept or entity page.
|
||||||
|
// Pages that exist on disk but are not in the current batch are loaded and
|
||||||
|
// appended so they will be updated on write.
|
||||||
|
func injectSourceRefs(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry, brainDir string) []wiki.Page {
|
||||||
|
sourceSlug, sourceTitle, found := findSourcePage(pages)
|
||||||
|
if !found {
|
||||||
|
return pages
|
||||||
|
}
|
||||||
|
|
||||||
|
var sourceContent string
|
||||||
|
for _, p := range pages {
|
||||||
|
if strings.HasPrefix(p.Path, "wiki/sources/") &&
|
||||||
|
strings.TrimSuffix(filepath.Base(p.Path), ".md") == sourceSlug {
|
||||||
|
sourceContent = p.Content
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
linkedSlugs := extractWikilinks(sourceContent)
|
||||||
|
sourceRef := "- [[" + sourceSlug + "|" + sourceTitle + "]]"
|
||||||
|
|
||||||
|
bySlug := make(map[string]int, len(pages))
|
||||||
|
for i, p := range pages {
|
||||||
|
if !strings.HasPrefix(p.Path, "wiki/sources/") {
|
||||||
|
bySlug[strings.TrimSuffix(filepath.Base(p.Path), ".md")] = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for slug := range linkedSlugs {
|
||||||
|
if slug == sourceSlug {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if idx, ok := bySlug[slug]; ok {
|
||||||
|
pages[idx] = addSourceRef(pages[idx], sourceRef)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
pt, ok := findInInventory(slug, inventory)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
diskPath := filepath.Join(brainDir, "wiki", string(pt), slug+".md")
|
||||||
|
b, err := os.ReadFile(diskPath)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
page := wiki.Page{
|
||||||
|
Path: "wiki/" + string(pt) + "/" + slug + ".md",
|
||||||
|
Content: string(b),
|
||||||
|
}
|
||||||
|
pages = append(pages, addSourceRef(page, sourceRef))
|
||||||
|
}
|
||||||
|
|
||||||
|
return pages
|
||||||
|
}
|
||||||
|
|
||||||
|
// addSourceRef injects sourceRef into the ## Sources bullet section of page
|
||||||
|
// using wiki.Merge, which deduplicates bullets automatically.
|
||||||
|
func addSourceRef(page wiki.Page, sourceRef string) wiki.Page {
|
||||||
|
patch := wiki.Page{
|
||||||
|
Path: page.Path,
|
||||||
|
Content: "\n## Sources\n\n" + sourceRef + "\n",
|
||||||
|
}
|
||||||
|
return wiki.Merge(page, patch)
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractWikilinks returns the set of slugs referenced as [[slug|...]] in content.
|
||||||
|
func extractWikilinks(content string) map[string]bool {
|
||||||
|
slugs := make(map[string]bool)
|
||||||
|
for _, m := range wikilinkRE.FindAllStringSubmatch(content, -1) {
|
||||||
|
slugs[m[1]] = true
|
||||||
|
}
|
||||||
|
return slugs
|
||||||
|
}
|
||||||
|
|
||||||
|
// findSourcePage returns the slug and title of the first wiki/sources/ page in pages.
|
||||||
|
func findSourcePage(pages []wiki.Page) (slug, title string, found bool) {
|
||||||
|
for _, p := range pages {
|
||||||
|
if strings.HasPrefix(p.Path, "wiki/sources/") {
|
||||||
|
slug = strings.TrimSuffix(filepath.Base(p.Path), ".md")
|
||||||
|
title = extractTitle(p.Content)
|
||||||
|
if title == "" {
|
||||||
|
title = slug
|
||||||
|
}
|
||||||
|
return slug, title, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "", "", false
|
||||||
|
}
|
||||||
|
|
||||||
|
// findInInventory returns the PageType for a slug if it appears in the inventory.
|
||||||
|
func findInInventory(slug string, inventory map[wiki.PageType][]wiki.Entry) (wiki.PageType, bool) {
|
||||||
|
for pt, entries := range inventory {
|
||||||
|
for _, e := range entries {
|
||||||
|
if e.Slug == slug {
|
||||||
|
return pt, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
172
ingestion/internal/pipeline/refs_test.go
Normal file
172
ingestion/internal/pipeline/refs_test.go
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
// ingestion/internal/pipeline/refs_test.go
|
||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||||
|
)
|
||||||
|
|
||||||
|
func makeInventory(concepts, entities []string) map[wiki.PageType][]wiki.Entry {
|
||||||
|
inv := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeConcept: {},
|
||||||
|
wiki.PageTypeEntity: {},
|
||||||
|
wiki.PageTypeSource: {},
|
||||||
|
}
|
||||||
|
for _, slug := range concepts {
|
||||||
|
inv[wiki.PageTypeConcept] = append(inv[wiki.PageTypeConcept], wiki.Entry{Slug: slug, Title: slug})
|
||||||
|
}
|
||||||
|
for _, slug := range entities {
|
||||||
|
inv[wiki.PageTypeEntity] = append(inv[wiki.PageTypeEntity], wiki.Entry{Slug: slug, Title: slug})
|
||||||
|
}
|
||||||
|
return inv
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInjectSourceRefs_NoSourcePage(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFoo.\n"},
|
||||||
|
}
|
||||||
|
got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir())
|
||||||
|
assert.Equal(t, pages, got)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInjectSourceRefs_InjectsIntoProposedConcept(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/my-article.md",
|
||||||
|
Content: "---\ntitle: My Article\n---\n\n## Summary\n\nSee [[domain-driven-design|Domain Driven Design]].\n",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Path: "wiki/concepts/domain-driven-design.md",
|
||||||
|
Content: "---\ntitle: Domain Driven Design\n---\n\n## Definition\n\nA methodology.\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir())
|
||||||
|
|
||||||
|
require.Len(t, got, 2)
|
||||||
|
assert.Contains(t, got[1].Content, "## Sources")
|
||||||
|
assert.Contains(t, got[1].Content, "[[my-article|My Article]]")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInjectSourceRefs_LoadsConceptFromDisk(t *testing.T) {
|
||||||
|
brainDir := t.TempDir()
|
||||||
|
conceptDir := filepath.Join(brainDir, "wiki", "concepts")
|
||||||
|
require.NoError(t, os.MkdirAll(conceptDir, 0o755))
|
||||||
|
require.NoError(t, os.WriteFile(
|
||||||
|
filepath.Join(conceptDir, "shape-up.md"),
|
||||||
|
[]byte("---\ntitle: Shape Up\n---\n\n## Definition\n\nA methodology.\n"),
|
||||||
|
0o644,
|
||||||
|
))
|
||||||
|
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/my-article.md",
|
||||||
|
Content: "---\ntitle: My Article\n---\n\n## Summary\n\nSee [[shape-up|Shape Up]].\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
inv := makeInventory([]string{"shape-up"}, nil)
|
||||||
|
|
||||||
|
got := injectSourceRefs(pages, inv, brainDir)
|
||||||
|
|
||||||
|
require.Len(t, got, 2)
|
||||||
|
var conceptPage wiki.Page
|
||||||
|
for _, p := range got {
|
||||||
|
if p.Path == "wiki/concepts/shape-up.md" {
|
||||||
|
conceptPage = p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert.Contains(t, conceptPage.Content, "## Sources")
|
||||||
|
assert.Contains(t, conceptPage.Content, "[[my-article|My Article]]")
|
||||||
|
assert.Contains(t, conceptPage.Content, "## Definition")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInjectSourceRefs_NoSelfReference(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/my-article.md",
|
||||||
|
Content: "---\ntitle: My Article\n---\n\n## Summary\n\nSelf-link [[my-article|My Article]].\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir())
|
||||||
|
assert.Len(t, got, 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInjectSourceRefs_DeduplicatesOnReingestion(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/my-article.md",
|
||||||
|
Content: "---\ntitle: My Article\n---\n\n## Summary\n\nSee [[ddd|DDD]].\n",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Path: "wiki/concepts/ddd.md",
|
||||||
|
Content: "---\ntitle: DDD\n---\n\n## Definition\n\nA thing.\n\n## Sources\n\n- [[my-article|My Article]]\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir())
|
||||||
|
|
||||||
|
require.Len(t, got, 2)
|
||||||
|
count := 0
|
||||||
|
for _, line := range splitLines(got[1].Content) {
|
||||||
|
if line == "- [[my-article|My Article]]" {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert.Equal(t, 1, count, "source ref should appear exactly once")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInjectSourceRefs_InjectsIntoEntity(t *testing.T) {
|
||||||
|
pages := []wiki.Page{
|
||||||
|
{
|
||||||
|
Path: "wiki/sources/book.md",
|
||||||
|
Content: "---\ntitle: Book\n---\n\n## Summary\n\nBy [[ryan-singer|Ryan Singer]].\n",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Path: "wiki/entities/ryan-singer.md",
|
||||||
|
Content: "---\ntitle: Ryan Singer\n---\n\n## Description\n\nA designer.\n",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir())
|
||||||
|
|
||||||
|
require.Len(t, got, 2)
|
||||||
|
var entity wiki.Page
|
||||||
|
for _, p := range got {
|
||||||
|
if p.Path == "wiki/entities/ryan-singer.md" {
|
||||||
|
entity = p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert.Contains(t, entity.Content, "[[book|Book]]")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExtractWikilinks(t *testing.T) {
|
||||||
|
content := "See [[foo|Foo]] and [[bar|Bar]] and [[foo|Foo again]]."
|
||||||
|
got := extractWikilinks(content)
|
||||||
|
assert.True(t, got["foo"])
|
||||||
|
assert.True(t, got["bar"])
|
||||||
|
assert.Len(t, got, 2, "duplicate slugs should be deduplicated")
|
||||||
|
}
|
||||||
|
|
||||||
|
// splitLines breaks s at every '\n' and returns the non-empty pieces in
// order. Empty lines, including those produced by leading, trailing, or
// consecutive newlines, are dropped. The result is nil when s contains no
// non-empty line.
func splitLines(s string) []string {
	var lines []string
	begin := 0
	for pos, ch := range s {
		if ch != '\n' {
			continue
		}
		// pos is the byte offset of the newline; keep the segment only when
		// it holds at least one byte.
		if pos > begin {
			lines = append(lines, s[begin:pos])
		}
		begin = pos + 1
	}
	// Whatever follows the final newline (or the whole string when there is
	// none) is the last line.
	if begin < len(s) {
		lines = append(lines, s[begin:])
	}
	return lines
}
|
||||||
Reference in New Issue
Block a user