From ae5a4d04f00b4abf29f9fd38bf13c667e72e06a5 Mon Sep 17 00:00:00 2001
From: Mathias Bergqvist
Date: Wed, 22 Apr 2026 22:28:55 +0200
Subject: [PATCH] feat(ingestion): add wiki page merge logic

---
 ingestion/internal/wiki/merge.go      | 175 ++++++++++++++++++++++++++
 ingestion/internal/wiki/merge_test.go |  70 ++++++++++
 2 files changed, 245 insertions(+)
 create mode 100644 ingestion/internal/wiki/merge.go
 create mode 100644 ingestion/internal/wiki/merge_test.go

diff --git a/ingestion/internal/wiki/merge.go b/ingestion/internal/wiki/merge.go
new file mode 100644
index 0000000..578bf72
--- /dev/null
+++ b/ingestion/internal/wiki/merge.go
@@ -0,0 +1,175 @@
+// ingestion/internal/wiki/merge.go
+package wiki
+
+import (
+	"fmt"
+	"strings"
+)
+
+// bulletSections names the headings whose bodies are merged as a union of
+// unique lines.
+var bulletSections = map[string]bool{
+	"Related Concepts":                  true,
+	"Related Entities":                  true,
+	"Sources":                           true,
+	"Key Claims":                        true,
+	"Entities Mentioned":                true,
+	"Concepts Introduced or Reinforced": true,
+	"Chapters":                          true,
+}
+
+// appendSections names the headings whose bodies are concatenated.
+var appendSections = map[string]bool{
+	"Evolving Notes":        true,
+	"Updates":               true,
+	"Open Questions Raised": true,
+	"Open Questions":        true,
+}
+
+// section is one "## " block of a page: its heading text and its body.
+type section struct {
+	heading string
+	content string
+}
+
+// Merge combines two Page values with the same path.
+//
+// Frontmatter is taken from a. Text between the frontmatter and the first
+// "## " heading is taken from a, falling back to b when a has none.
+// Sections are merged by strategy: bullet sections union unique lines,
+// append sections concatenate, all others keep a's version. Sections in b
+// not present in a are appended.
+func Merge(a, b Page) Page {
+	fmA, preA, secsA := parseSections(a.Content)
+	_, preB, secsB := parseSections(b.Content)
+	if preA == "" {
+		preA = preB
+	}
+
+	idx := make(map[string]int, len(secsA))
+	for i, s := range secsA {
+		idx[s.heading] = i
+	}
+
+	for _, sB := range secsB {
+		i, exists := idx[sB.heading]
+		if !exists {
+			idx[sB.heading] = len(secsA)
+			secsA = append(secsA, sB)
+			continue
+		}
+		switch {
+		case bulletSections[sB.heading]:
+			secsA[i].content = mergeBullets(secsA[i].content, sB.content)
+		case appendSections[sB.heading]:
+			secsA[i].content = appendContent(secsA[i].content, sB.content)
+		}
+	}
+
+	return Page{Path: a.Path, Content: rebuildContent(fmA, preA, secsA)}
+}
+
+// parseSections splits markdown into its frontmatter block, the preamble
+// (text between the frontmatter and the first "## " heading), and the
+// "## " sections. Preamble and section bodies are returned without leading
+// or trailing newlines so rebuildContent can re-emit them with stable
+// spacing across repeated merges.
+func parseSections(markdown string) (frontmatter, preamble string, sections []section) {
+	lines := strings.Split(markdown, "\n")
+	i := 0
+
+	// strings.Split always yields at least one element, so lines[0] is safe.
+	if strings.TrimSpace(lines[0]) == "---" {
+		// Accept the block only if it is closed; an unterminated "---"
+		// must not swallow the whole document as frontmatter.
+		for j := 1; j < len(lines); j++ {
+			if strings.TrimSpace(lines[j]) == "---" {
+				frontmatter = fmt.Sprintf("---\n%s\n---\n", strings.Join(lines[1:j], "\n"))
+				i = j + 1
+				break
+			}
+		}
+	}
+
+	var (
+		buf       []string
+		heading   string
+		inSection bool
+	)
+	// flush stores the buffered lines as the preamble (before any heading)
+	// or as the body of the section being read.
+	flush := func() {
+		text := strings.Trim(strings.Join(buf, "\n"), "\n")
+		buf = buf[:0]
+		if !inSection {
+			preamble = text
+			return
+		}
+		sections = append(sections, section{heading: heading, content: text})
+	}
+	for _, line := range lines[i:] {
+		if strings.HasPrefix(line, "## ") {
+			flush()
+			// Trim so trailing spaces or a CR do not defeat the
+			// strategy lookups in Merge.
+			heading = strings.TrimSpace(strings.TrimPrefix(line, "## "))
+			inSection = true
+			continue
+		}
+		buf = append(buf, line)
+	}
+	flush()
+	return frontmatter, preamble, sections
+}
+
+// rebuildContent renders frontmatter, preamble, and sections back into
+// markdown with one blank line between blocks.
+func rebuildContent(frontmatter, preamble string, sections []section) string {
+	var sb strings.Builder
+	sb.WriteString(frontmatter)
+	if preamble != "" {
+		fmt.Fprintf(&sb, "\n%s\n", preamble)
+	}
+	for _, sec := range sections {
+		fmt.Fprintf(&sb, "\n## %s\n", sec.heading)
+		if sec.content != "" {
+			fmt.Fprintf(&sb, "\n%s\n", sec.content)
+		}
+	}
+	return sb.String()
+}
+
+// mergeBullets returns the lines of a followed by the lines of b, dropping
+// blank lines and any line already seen. Lines are compared with
+// surrounding whitespace ignored; the first occurrence is kept as written.
+func mergeBullets(a, b string) string {
+	seen := make(map[string]bool)
+	var lines []string
+	// Split a and b separately: splitting a+b would glue a's last line to
+	// b's first line whenever a does not end in a newline.
+	for _, block := range []string{a, b} {
+		for _, line := range strings.Split(block, "\n") {
+			trimmed := strings.TrimSpace(line)
+			if trimmed == "" || seen[trimmed] {
+				continue
+			}
+			seen[trimmed] = true
+			lines = append(lines, line)
+		}
+	}
+	return strings.Join(lines, "\n")
+}
+
+// appendContent joins two section bodies with one blank line between them,
+// returning the other side unchanged when one of them is empty.
+func appendContent(a, b string) string {
+	a = strings.TrimRight(a, "\n")
+	b = strings.TrimLeft(b, "\n")
+	switch {
+	case a == "":
+		return b
+	case b == "":
+		return a
+	}
+	return a + "\n\n" + b
+}
diff --git a/ingestion/internal/wiki/merge_test.go b/ingestion/internal/wiki/merge_test.go
new file mode 100644
index 0000000..d69480d
--- /dev/null
+++ b/ingestion/internal/wiki/merge_test.go
@@ -0,0 +1,70 @@
+// ingestion/internal/wiki/merge_test.go
+package wiki
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestMerge_BulletSectionsUnion(t *testing.T) {
+	a := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Related Concepts\n\n- [[bar|Bar]]\n"}
+	b := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Related Concepts\n\n- [[bar|Bar]]\n- [[baz|Baz]]\n"}
+
+	got := Merge(a, b)
+	assert.Contains(t, got.Content, "[[bar|Bar]]")
+	assert.Contains(t, got.Content, "[[baz|Baz]]")
+	assert.Equal(t, 1, strings.Count(got.Content, "[[bar|Bar]]"))
+}
+
+func TestMerge_AppendSections(t *testing.T) {
+	a := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Evolving Notes\n\nFirst note.\n"}
+	b := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Evolving Notes\n\nSecond note.\n"}
+
+	got := Merge(a, b)
+	assert.Contains(t, got.Content, "First note.")
+	assert.Contains(t, got.Content, "Second note.")
+}
+
+func TestMerge_KeepFirstForOtherSections(t *testing.T) {
+	a := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFirst definition.\n"}
+	b := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nSecond definition.\n"}
+
+	got := Merge(a, b)
+	assert.Contains(t, got.Content, "First definition.")
+	assert.NotContains(t, got.Content, "Second definition.")
+}
+
+func TestMerge_NewSectionFromB(t *testing.T) {
+	a := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nA thing.\n"}
+	b := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Why It Matters\n\nBecause reasons.\n"}
+
+	got := Merge(a, b)
+	assert.Contains(t, got.Content, "A thing.")
+	assert.Contains(t, got.Content, "Because reasons.")
+}
+
+func TestMerge_KeepsFrontmatterFromA(t *testing.T) {
+	a := Page{Path: "p.md", Content: "---\ntitle: A\nlast_updated: 2026-01-01\n---\n\n## Definition\n\nA.\n"}
+	b := Page{Path: "p.md", Content: "---\ntitle: B\nlast_updated: 2026-06-01\n---\n\n## Definition\n\nB.\n"}
+
+	got := Merge(a, b)
+	assert.Contains(t, got.Content, "title: A")
+	assert.NotContains(t, got.Content, "title: B")
+}
+
+func TestMerge_KeepsPreamble(t *testing.T) {
+	a := Page{Path: "p.md", Content: "---\ntitle: A\n---\n\n# A\n\nIntro.\n\n## Definition\n\nA.\n"}
+	b := Page{Path: "p.md", Content: "---\ntitle: A\n---\n\n## Definition\n\nB.\n"}
+
+	got := Merge(a, b)
+	assert.Contains(t, got.Content, "# A\n\nIntro.\n\n## Definition")
+}
+
+func TestMerge_StableSpacing(t *testing.T) {
+	a := Page{Path: "p.md", Content: "---\ntitle: A\n---\n\n## Definition\n\nA.\n\n## Sources\n\n- [[s|S]]\n"}
+
+	got := Merge(a, a)
+	assert.Equal(t, a.Content, got.Content)
+}