feat(graph): add brain_entities + brain_edges store and wikilink parser
All checks were successful
CI / Lint / Test / Vet (push) Successful in 12s
CI / Mirror to GitHub (push) Successful in 3s

Foundation for Track A (GraphRAG on top of existing wiki). Two new
packages, both unwired — service behaviour unchanged until commit 2
hooks the pipeline.

- internal/graph: pure parser. Extract() walks markdown + frontmatter
  and emits one Entity + N wikilink Edges per doc. Dedupes per (dst,
  line), ignores self-references, classifies hall/concept/entity/
  source/knowledge from path layout.

- internal/graphstore: pgx-backed PGStore mirroring vectorstore's
  shape. Idempotent Init() creates brain_entities + brain_edges with
  indexes on src_slug, dst_slug, src_doc, wing, type. Operations:
  UpsertEntity, ReplaceEdgesForDoc (tx), DeleteByDoc, Neighbors,
  Subgraph (recursive CTE, depth ≤6), Path (shortest path, depth ≤8).

Schema lives on the shared postgres18 instance alongside the
brain_embeddings table — no new datastore. See
docs/superpowers/specs/2026-05-homelab-training-graph-next-step.md
in infra repo + infra#62.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mathias
2026-05-23 15:18:08 +02:00
parent c153e9105c
commit f53ee18cb6
3 changed files with 657 additions and 0 deletions

View File

@@ -0,0 +1,200 @@
// Package graph extracts entity + edge records from brain markdown
// documents for the brain_entities / brain_edges relational graph.
//
// The extractor is pure: it takes markdown bytes and a document path and
// returns the entity (one per doc) and the wikilink edges (zero or more)
// it found, with source line numbers so the graph store can record
// provenance.
//
// Edge types in v1: only "wikilink" — derived from [[slug]] and
// [[slug|Display]] occurrences in the body. Section-header edges are
// deferred (see infra#62 grill addendum).
package graph
import (
"bufio"
"bytes"
"path/filepath"
"regexp"
"strings"
)
// Entity represents one brain document for graph indexing.
//
// Slug is the basename without ".md" — the same identity used by
// wiki canonicalization and the wikilink target syntax.
//
// Type categorises the doc into a coarse bucket so callers can filter
// graph traversals (e.g. "only entity nodes"). When the doc lives
// under brain/wiki/<wing>/<hall>/, Wing and Hall capture the
// taxonomy; otherwise they stay empty unless the document's
// frontmatter supplies them (legacy brain/knowledge/ docs).
type Entity struct {
	// DocPath is the document path: forward-slash, relative to brainDir.
	DocPath string
	// Slug is the file's basename with the ".md" suffix removed.
	Slug string
	// Type is one of "concept" | "entity" | "source" | "hall" | "knowledge".
	Type string
	// Wing is optional. Extract derives it from the path first; a
	// frontmatter "wing:" value is used only when the path gave none.
	Wing string
	// Hall is optional. Same precedence as Wing: path first, then a
	// frontmatter "hall:" value as fallback.
	Hall string
	// Title is optional and comes from the frontmatter "title:" key only.
	Title string
}
// Edge represents a directed relationship between two slugs.
//
// SrcLine is the 1-indexed line in the source document where the link
// was found, so callers can re-find the linking text after an edit.
type Edge struct {
	// SrcDoc is the linking document's path: forward-slash, relative to brainDir.
	SrcDoc string
	// SrcSlug is the linking document's slug (== Entity.Slug for SrcDoc).
	SrcSlug string
	// DstSlug is the link target exactly as written in the wikilink,
	// with surrounding whitespace trimmed; it is not checked against
	// any existing document.
	DstSlug string
	// EdgeType is always "wikilink" in v1.
	EdgeType string
	// SrcLine is the 1-indexed line number of the link in SrcDoc.
	SrcLine int
}
// linkRE matches both [[slug]] and [[slug|Display Name]] wikilinks.
// Group 1 is the slug; group 2 (if present) is the display.
//
// Group 1 captures every character up to the first "|" or "]", so
// surrounding spaces and any "#anchor" suffix are part of the captured
// text; callers are expected to trim it. Compiled once at package scope
// so it is never rebuilt per document.
var linkRE = regexp.MustCompile(`\[\[([^\]|]+)(?:\|([^\]]+))?\]\]`)
// Extract turns a single markdown document into its graph records: the
// Entity describing the document itself and the wikilink Edges that
// leave it.
//
// docPath must be forward-slash and relative to brainDir; content is
// the raw markdown. Classification from the path runs before the
// frontmatter is read, so path-derived Wing/Hall values take precedence
// and frontmatter only fills what the path left empty.
//
// The boolean result is false (with a zero Entity and nil edges) when
// docPath does not name a ".md" file with a non-empty basename.
func Extract(docPath string, content []byte) (Entity, []Edge, bool) {
	name := slugFromPath(docPath)
	if name == "" {
		return Entity{}, nil, false
	}

	var doc Entity
	doc.DocPath = docPath
	doc.Slug = name

	classifyByPath(&doc, docPath)
	readFrontmatter(&doc, content)

	return doc, extractEdges(docPath, name, content), true
}
// slugFromPath returns the final path element of docPath with its
// ".md" extension removed. It returns "" when that element does not
// end in ".md" (and therefore also "" for a file named just ".md").
func slugFromPath(docPath string) string {
	const ext = ".md"

	name := filepath.Base(docPath)
	if strings.HasSuffix(name, ext) {
		return name[:len(name)-len(ext)]
	}
	return ""
}
// classifyByPath fills Type / Wing / Hall on e from the path layout.
//
// docPath is split on "/". Anything whose first component is not
// "wiki" is typed "knowledge" and gets no Wing/Hall. Under wiki/ the
// second component decides:
//
//	wiki/concepts/...            -> Type "concept"
//	wiki/entities/...            -> Type "entity"
//	wiki/sources/...             -> Type "source"
//	wiki/<wing>/<hall>/<slug>.md -> Type "hall", Wing and Hall set
//	wiki/<wing>/<slug>.md        -> Type "hall", Wing set, Hall empty
//	wiki/<slug>.md               -> Type "hall", Wing and Hall empty
//
// Only fields determined by the path are written; everything else on e
// is left untouched.
func classifyByPath(e *Entity, docPath string) {
	parts := strings.Split(docPath, "/")
	if len(parts) < 2 || parts[0] != "wiki" {
		e.Type = "knowledge"
		return
	}
	switch parts[1] {
	case "concepts":
		e.Type = "concept"
	case "entities":
		e.Type = "entity"
	case "sources":
		e.Type = "source"
	default:
		e.Type = "hall"
		// parts[1] is a wing directory only when something follows it.
		// For a doc sitting directly in wiki/ it is the file name, and
		// recording that as the wing would also stop a frontmatter
		// "wing:" value from being applied later.
		if len(parts) >= 3 {
			e.Wing = parts[1]
		}
		if len(parts) >= 4 {
			e.Hall = parts[2]
		}
	}
}
// readFrontmatter copies title / wing / hall out of a leading
// frontmatter block into e.
//
// The block must open with a "---" line as the very first line of
// content (surrounding whitespace on that line is ignored) and runs
// until the next "---" line or the end of the content. Inside it, each
// line is split at its first ":"; lines without one are skipped. Values
// are whitespace-trimmed and then stripped of leading/trailing quote
// characters. This is a line-based reader, not a YAML parser.
//
// A field on e is only written while it is still empty, so values
// already set by the caller win and the first non-empty occurrence of
// a key wins over later ones. Content without frontmatter leaves e
// unchanged.
func readFrontmatter(e *Entity, content []byte) {
	sc := bufio.NewScanner(bytes.NewReader(content))

	// No opening fence on the first line means there is no frontmatter.
	if !sc.Scan() || strings.TrimSpace(sc.Text()) != "---" {
		return
	}

	for sc.Scan() {
		raw := sc.Text()
		if strings.TrimSpace(raw) == "---" {
			// Closing fence: the body that follows is not our concern.
			return
		}

		name, rest, found := strings.Cut(raw, ":")
		if !found {
			continue
		}

		var field *string
		switch strings.TrimSpace(name) {
		case "title":
			field = &e.Title
		case "wing":
			field = &e.Wing
		case "hall":
			field = &e.Hall
		default:
			continue
		}

		if *field == "" {
			*field = strings.Trim(strings.TrimSpace(rest), `"'`)
		}
	}
}
// extractEdges scans content line by line and returns one "wikilink"
// Edge for every distinct (target, line) pair matched by linkRE.
//
// docPath and srcSlug are copied into each edge as SrcDoc / SrcSlug.
// Targets are whitespace-trimmed; empty targets and links back to
// srcSlug are dropped. SrcLine is the 1-indexed line of the match.
// The result is nil when the document contains no qualifying links.
func extractEdges(docPath, srcSlug string, content []byte) []Edge {
	var edges []Edge
	seen := make(map[string]struct{}) // dedupe (dst, line)

	scanner := bufio.NewScanner(bytes.NewReader(content))
	// A Scanner stops with ErrTooLong at the first line over its token
	// limit (64 KiB by default), which would silently drop every link
	// from that line onward. No line can be longer than the content
	// itself, so size the limit from it.
	scanner.Buffer(make([]byte, 0, 4096), len(content)+1)

	line := 0
	for scanner.Scan() {
		line++
		matches := linkRE.FindAllStringSubmatch(scanner.Text(), -1)
		if len(matches) == 0 {
			continue
		}
		// The line part of the dedupe key is the same for every match
		// on this line, so build it once.
		lineKey := "|" + itoa(line)
		for _, m := range matches {
			dst := strings.TrimSpace(m[1])
			if dst == "" || dst == srcSlug {
				continue
			}
			key := dst + lineKey
			if _, dup := seen[key]; dup {
				continue
			}
			seen[key] = struct{}{}
			edges = append(edges, Edge{
				SrcDoc:   docPath,
				SrcSlug:  srcSlug,
				DstSlug:  dst,
				EdgeType: "wikilink",
				SrcLine:  line,
			})
		}
	}
	// scanner.Err() is not consulted: the source is an in-memory reader
	// and the token limit above rules out ErrTooLong.
	return edges
}
// itoa returns the base-10 representation of n. It exists so the edge
// dedupe key can be built without pulling in the fmt dependency on a
// hot path. Zero is answered directly; every other value is rendered
// digit by digit into a fixed stack buffer.
func itoa(n int) string {
	if n == 0 {
		return "0"
	}

	// Work on the magnitude as an unsigned value: negating the minimum
	// int as a signed number overflows back to itself, whereas unsigned
	// negation yields the correct magnitude.
	mag := uint(n)
	if n < 0 {
		mag = -mag
	}

	var buf [20]byte // 19 digits plus a sign covers any 64-bit int
	i := len(buf)
	for mag > 0 {
		i--
		buf[i] = byte('0' + mag%10)
		mag /= 10
	}
	if n < 0 {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}