feat(graph): add brain_entities + brain_edges store and wikilink parser

Foundation for Track A (GraphRAG on top of existing wiki). Two new packages, both unwired — service behaviour is unchanged until commit 2 hooks them into the pipeline.

- internal/graph: pure parser. Extract() walks markdown + frontmatter and emits one Entity + N wikilink Edges per doc. Dedupes per (dst, line), ignores self-references, and classifies hall/concept/entity/source/knowledge from the path layout.
- internal/graphstore: pgx-backed PGStore mirroring vectorstore's shape. Idempotent Init() creates brain_entities + brain_edges with indexes on src_slug, dst_slug, src_doc, wing, type. Operations: UpsertEntity, ReplaceEdgesForDoc (tx), DeleteByDoc, Neighbors, Subgraph (recursive CTE, depth ≤6), Path (shortest path, depth ≤8).

Schema lives on the shared postgres18 instance alongside the brain_embeddings table — no new datastore.

See docs/superpowers/specs/2026-05-homelab-training-graph-next-step.md in infra repo + infra#62.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 15:18:08 +02:00
parent c153e9105c
commit f53ee18cb6
3 changed files with 657 additions and 0 deletions
--- a/ingestion/internal/graph/extract.go
+++ b/ingestion/internal/graph/extract.go
@@ -0,0 +1,200 @@
+// Package graph extracts entity + edge records from brain markdown
+// documents for the brain_entities / brain_edges relational graph.
+//
+// The extractor is pure: it takes markdown bytes and a document path and
+// returns the entity (one per doc) and the wikilink edges (zero or more)
+// it found, with source line numbers so the graph store can record
+// provenance.
+//
+// Edge types in v1: only "wikilink" — derived from [[slug]] and
+// [[slug|Display]] occurrences in the body. Section-header edges are
+// deferred (see infra#62 grill addendum).
+package graph
+
+import (
+	"bufio"
+	"bytes"
+	"path/filepath"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// Entity represents one brain document for graph indexing.
+//
+// Slug is the basename without ".md" — the same identity used by
+// wiki canonicalization and the wikilink target syntax.
+//
+// Type categorises the doc into a coarse bucket so callers can filter
+// graph traversals (e.g. "only entity nodes"). When the doc lives
+// under brain/wiki/<wing>/<hall>/, Wing and Hall capture the
+// taxonomy. Extract applies path-derived values first; frontmatter
+// only fills Wing / Hall / Title when they are still empty, so docs
+// outside that layout (e.g. legacy brain/knowledge/ docs) have empty
+// Wing and Hall unless their frontmatter supplies them.
+type Entity struct {
+	DocPath string // forward-slash, relative to brainDir
+	Slug    string // basename of DocPath without the ".md" suffix
+	Type    string // "concept" | "entity" | "source" | "hall" | "knowledge"
+	Wing    string // optional; from path, else from frontmatter
+	Hall    string // optional; from path, else from frontmatter
+	Title   string // optional; from frontmatter
+}
+
+// Edge represents a directed relationship between two slugs.
+//
+// SrcLine is the 1-indexed line in the source document where the link
+// was found, so callers can re-find the linking text after an edit.
+//
+// Extract emits at most one Edge per (DstSlug, SrcLine) pair, so the
+// same target linked on several lines yields several edges. DstSlug is
+// the whitespace-trimmed link target as written; it is not checked
+// against any existing document.
+type Edge struct {
+	SrcDoc   string // forward-slash, relative to brainDir
+	SrcSlug  string // == Entity.Slug for SrcDoc
+	DstSlug  string // link target; never empty and never equal to SrcSlug
+	EdgeType string // "wikilink" in v1
+	SrcLine  int    // 1-indexed
+}
+
+// linkRE matches both [[slug]] and [[slug|Display Name]] wikilinks.
+// Group 1 is the slug; group 2 (if present) is the display.
+//
+// The slug may contain any characters except ']' and '|', and the
+// display any characters except ']'. Surrounding whitespace is kept by
+// the pattern, so callers trim the captured slug themselves.
+var linkRE = regexp.MustCompile(`\[\[([^\]|]+)(?:\|([^\]]+))?\]\]`)
+
+// Extract parses one markdown document and returns its Entity plus the
+// outgoing wikilink Edges. docPath is forward-slash, relative to
+// brainDir; content is the raw markdown bytes.
+//
+// Returns ok=false when docPath does not yield a usable slug (e.g.
+// non-markdown file slipped through).
+func Extract(docPath string, content []byte) (Entity, []Edge, bool) {
+	slug := slugFromPath(docPath)
+	if slug == "" {
+		return Entity{}, nil, false
+	}
+	ent := Entity{DocPath: docPath, Slug: slug}
+	classifyByPath(&ent, docPath)
+	readFrontmatter(&ent, content)
+
+	edges := extractEdges(docPath, slug, content)
+	return ent, edges, true
+}
+
+// slugFromPath returns the basename of docPath with its ".md" suffix
+// removed, or "" when the basename does not end in ".md". A basename
+// of exactly ".md" also yields "".
+func slugFromPath(docPath string) string {
+	name := filepath.Base(docPath)
+	slug := strings.TrimSuffix(name, ".md")
+	if slug == name {
+		// Nothing was trimmed, so this is not a markdown file.
+		return ""
+	}
+	return slug
+}
+
+// classifyByPath fills Type / Wing / Hall from the path layout.
+//
+// Recognised layouts, with docPath forward-slash and relative to
+// brainDir:
+//
+//	wiki/concepts/<slug>.md       -> Type "concept"
+//	wiki/entities/<slug>.md       -> Type "entity"
+//	wiki/sources/<slug>.md        -> Type "source"
+//	wiki/<wing>/<hall>/<slug>.md  -> Type "hall", Wing and Hall set
+//	wiki/<wing>/<slug>.md         -> Type "hall", Wing set
+//	anything else                 -> Type "knowledge"
+//
+// A markdown file sitting directly under wiki/ has no wing directory,
+// so its Wing is left empty rather than being set to the file name.
+// Its Type stays "hall" so existing classification does not change.
+func classifyByPath(e *Entity, docPath string) {
+	parts := strings.Split(docPath, "/")
+	if len(parts) < 2 || parts[0] != "wiki" {
+		e.Type = "knowledge"
+		return
+	}
+	switch parts[1] {
+	case "concepts":
+		e.Type = "concept"
+	case "entities":
+		e.Type = "entity"
+	case "sources":
+		e.Type = "source"
+	default:
+		e.Type = "hall"
+		// parts[1] is a wing directory only when something follows it;
+		// with exactly two parts it is the document's own file name.
+		if len(parts) >= 3 {
+			e.Wing = parts[1]
+		}
+		if len(parts) >= 4 {
+			e.Hall = parts[2]
+		}
+	}
+}
+
+// readFrontmatter pulls title/wing/hall from a YAML frontmatter block.
+// Frontmatter is optional; missing fields leave the entity unchanged.
+func readFrontmatter(e *Entity, content []byte) {
+	scanner := bufio.NewScanner(bytes.NewReader(content))
+	inFM := false
+	for scanner.Scan() {
+		line := scanner.Text()
+		if strings.TrimSpace(line) == "---" {
+			if !inFM {
+				inFM = true
+				continue
+			}
+			return
+		}
+		if !inFM {
+			return
+		}
+		key, val, ok := strings.Cut(line, ":")
+		if !ok {
+			continue
+		}
+		v := strings.Trim(strings.TrimSpace(val), `"'`)
+		switch strings.TrimSpace(key) {
+		case "title":
+			if e.Title == "" {
+				e.Title = v
+			}
+		case "wing":
+			if e.Wing == "" {
+				e.Wing = v
+			}
+		case "hall":
+			if e.Hall == "" {
+				e.Hall = v
+			}
+		}
+	}
+}
+
+// extractEdges scans content line by line and returns one wikilink
+// Edge per distinct (target slug, line) pair, in document order.
+//
+// Lines are split on '\n' directly instead of with bufio.Scanner. The
+// scanner stops with ErrTooLong at the first line longer than
+// bufio.MaxScanTokenSize (64 KiB), and because this function has no
+// error return that silently dropped every link from that line onward.
+//
+// Link targets are whitespace-trimmed; empty targets and links back to
+// srcSlug are skipped. SrcLine is 1-indexed. As with bufio.ScanLines,
+// one trailing '\r' is stripped from each line and a final newline
+// does not produce an extra empty line.
+func extractEdges(docPath, srcSlug string, content []byte) []Edge {
+	var edges []Edge
+	seen := make(map[string]struct{}) // dedupe (dst, line)
+	rest := content
+	for line := 1; len(rest) > 0; line++ {
+		var cur []byte
+		// When no newline remains, Cut returns the tail as cur and a nil
+		// rest, which ends the loop after this iteration.
+		cur, rest, _ = bytes.Cut(rest, []byte{'\n'})
+		cur = bytes.TrimSuffix(cur, []byte{'\r'})
+
+		for _, m := range linkRE.FindAllSubmatch(cur, -1) {
+			dst := strings.TrimSpace(string(m[1]))
+			if dst == "" || dst == srcSlug {
+				continue
+			}
+			// linkRE forbids '|' inside the slug, so this key is unambiguous.
+			key := dst + "|" + itoa(line)
+			if _, dup := seen[key]; dup {
+				continue
+			}
+			seen[key] = struct{}{}
+			edges = append(edges, Edge{
+				SrcDoc:   docPath,
+				SrcSlug:  srcSlug,
+				DstSlug:  dst,
+				EdgeType: "wikilink",
+				SrcLine:  line,
+			})
+		}
+	}
+	return edges
+}
+
+// itoa avoids the fmt dependency on a hot path. Single-digit fast path
+// keeps overhead negligible for typical line counts.
+func itoa(n int) string {
+	if n == 0 {
+		return "0"
+	}
+	var buf [20]byte
+	i := len(buf)
+	neg := n < 0
+	if neg {
+		n = -n
+	}
+	for n > 0 {
+		i--
+		buf[i] = byte('0' + n%10)
+		n /= 10
+	}
+	if neg {
+		i--
+		buf[i] = '-'
+	}
+	return string(buf[i:])
+}
--- a/ingestion/internal/graph/extract_test.go
+++ b/ingestion/internal/graph/extract_test.go
@@ -0,0 +1,106 @@
+package graph
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestExtract_HallDoc checks a doc at wiki/<wing>/<hall>/<slug>.md: it
+// must be classified as type "hall" with wing, hall and title
+// populated, and its three wikilinks (plain and with display text)
+// must come back in document order with the source slug, source doc
+// and a positive line number.
+func TestExtract_HallDoc(t *testing.T) {
+	content := []byte(`---
+wing: jepa-fx
+hall: decisions
+title: Val Vol Decision
+---
+# Val Vol
+
+See also [[other-decision]] and [[parent-concept|Parent Concept]].
+
+Linking to [[unrelated]].
+`)
+
+	ent, edges, ok := Extract("wiki/jepa-fx/decisions/val-vol.md", content)
+	require.True(t, ok)
+	assert.Equal(t, "val-vol", ent.Slug)
+	assert.Equal(t, "hall", ent.Type)
+	assert.Equal(t, "jepa-fx", ent.Wing)
+	assert.Equal(t, "decisions", ent.Hall)
+	assert.Equal(t, "Val Vol Decision", ent.Title)
+
+	require.Len(t, edges, 3)
+	assert.Equal(t, "other-decision", edges[0].DstSlug)
+	assert.Equal(t, "parent-concept", edges[1].DstSlug)
+	assert.Equal(t, "unrelated", edges[2].DstSlug)
+	// Fields shared by every edge of this document.
+	for _, e := range edges {
+		assert.Equal(t, "wikilink", e.EdgeType)
+		assert.Equal(t, "val-vol", e.SrcSlug)
+		assert.Equal(t, "wiki/jepa-fx/decisions/val-vol.md", e.SrcDoc)
+		assert.Greater(t, e.SrcLine, 0)
+	}
+}
+
+// TestExtract_LegacyConceptDoc checks a doc under wiki/concepts/: it
+// must be typed "concept" with empty Wing and Hall, take its title
+// from frontmatter, and yield the slug part of a [[slug|Display]]
+// link as the edge target.
+func TestExtract_LegacyConceptDoc(t *testing.T) {
+	content := []byte(`---
+title: Hash Encoding
+---
+# Hash Encoding
+
+Linked to [[financial-sentiment-analysis|FSA]].
+`)
+	ent, edges, ok := Extract("wiki/concepts/hash-encoding.md", content)
+	require.True(t, ok)
+	assert.Equal(t, "hash-encoding", ent.Slug)
+	assert.Equal(t, "concept", ent.Type)
+	assert.Empty(t, ent.Wing)
+	assert.Empty(t, ent.Hall)
+	assert.Equal(t, "Hash Encoding", ent.Title)
+
+	require.Len(t, edges, 1)
+	assert.Equal(t, "financial-sentiment-analysis", edges[0].DstSlug)
+}
+
+// TestExtract_KnowledgeDoc checks a doc outside wiki/ with neither
+// frontmatter nor links: it must be typed "knowledge" and produce no
+// edges.
+func TestExtract_KnowledgeDoc(t *testing.T) {
+	content := []byte("# No frontmatter, no links here.\n")
+	ent, edges, ok := Extract("knowledge/some-note.md", content)
+	require.True(t, ok)
+	assert.Equal(t, "some-note", ent.Slug)
+	assert.Equal(t, "knowledge", ent.Type)
+	assert.Empty(t, edges)
+}
+
+// TestExtract_DedupesRepeatedLinkOnSameLine checks that the same
+// target linked twice on one line yields a single edge.
+func TestExtract_DedupesRepeatedLinkOnSameLine(t *testing.T) {
+	content := []byte("See [[foo]] and [[foo]] again on the same line.\n")
+	_, edges, ok := Extract("knowledge/dup.md", content)
+	require.True(t, ok)
+	require.Len(t, edges, 1)
+	assert.Equal(t, "foo", edges[0].DstSlug)
+}
+
+// TestExtract_KeepsMultipleEdgesOnDifferentLines checks that the same
+// target linked on two different lines yields two edges with distinct
+// line numbers, i.e. deduplication is per line, not per document.
+func TestExtract_KeepsMultipleEdgesOnDifferentLines(t *testing.T) {
+	content := []byte("First mention [[foo]].\n\nSecond mention [[foo]].\n")
+	_, edges, ok := Extract("knowledge/multi.md", content)
+	require.True(t, ok)
+	require.Len(t, edges, 2)
+	assert.NotEqual(t, edges[0].SrcLine, edges[1].SrcLine)
+}
+
+// TestExtract_IgnoresSelfLinks checks that a link whose target equals
+// the document's own slug produces no edge.
+func TestExtract_IgnoresSelfLinks(t *testing.T) {
+	content := []byte("Self-reference [[self]] should be ignored.\n")
+	_, edges, ok := Extract("knowledge/self.md", content)
+	require.True(t, ok)
+	assert.Empty(t, edges)
+}
+
+// TestExtract_RejectsNonMarkdown checks that a path not ending in
+// ".md" makes Extract report ok=false.
+func TestExtract_RejectsNonMarkdown(t *testing.T) {
+	_, _, ok := Extract("wiki/concepts/not-markdown.txt", []byte("anything"))
+	assert.False(t, ok)
+}
+
+// TestExtract_LineNumbersAre1Indexed checks that a link on the second
+// line of the content is reported with SrcLine == 2.
+func TestExtract_LineNumbersAre1Indexed(t *testing.T) {
+	content := []byte("line 1\nline 2 [[bar]]\n")
+	_, edges, ok := Extract("knowledge/lines.md", content)
+	require.True(t, ok)
+	require.Len(t, edges, 1)
+	assert.Equal(t, 2, edges[0].SrcLine)
+}