classifyByPath had a hole: paths like wiki/index.md or wiki/<slug>.md (direct children of wiki/, no subdirectory) hit the default branch and wrote Wing=parts[1] — which IS the filename, not a wing.

Symptom in brain_entities: rows like (slug=index, wing=index.md) and (slug=autobe-..., wing=autobe-evaluation-pattern-....md).

Fix: when len(parts) < 3 (no subdirectory at all), fall through to Type=knowledge and let frontmatter set wing/hall if present.

Add brain/eval/ artifacts at the same time:
- qa-2026-05.md — 20 hand-authored Q→expected-slug pairs covering the homelab knowledge corpus across mcp, dex, gitops, postgres, go, models, methodology
- score.py — calls brain_query for each pair, scores top-1 + top-3, and emits per-question detail. BRAIN_MCP_TOKEN is read from the environment.

Pre-fix baseline against the live brain: top-1 = 20% (4/20), top-3 = 65% (13/20). There are six hard misses where the expected slug doesn't even land in the top-5.

This baseline is used to gate the phase 2 DIKW redesign (infra#62 follow-up): if the phase 1 fixes (this parser fix plus authoring 20 backlinks on the top orphans) lift top-1 by fewer than 10 absolute points, structure is the bottleneck and the tier redesign is justified.
212 lines
5.6 KiB
Go
212 lines
5.6 KiB
Go
// Package graph extracts entity + edge records from brain markdown
|
|
// documents for the brain_entities / brain_edges relational graph.
|
|
//
|
|
// The extractor is pure: it takes markdown bytes and a document path and
|
|
// returns the entity (one per doc) and the wikilink edges (zero or more)
|
|
// it found, with source line numbers so the graph store can record
|
|
// provenance.
|
|
//
|
|
// Edge types in v1: only "wikilink" — derived from [[slug]] and
|
|
// [[slug|Display]] occurrences in the body. Section-header edges are
|
|
// deferred (see infra#62 grill addendum).
|
|
package graph
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// Entity is the graph node produced for a single brain document.
//
// Exactly one Entity is built per markdown file. Path-derived fields are
// filled first (see classifyByPath); frontmatter then fills whatever the
// path left empty (see readFrontmatter), so a path-derived Wing or Hall
// is never overwritten by frontmatter.
type Entity struct {
	// DocPath is the document path: forward-slash, relative to brainDir.
	DocPath string

	// Slug is the file's basename with the ".md" suffix removed. It is
	// the identity that wikilink targets refer to.
	Slug string

	// Type is a coarse bucket callers can filter traversals on. One of
	// "concept", "entity", "source", "hall" or "knowledge".
	Type string

	// Wing is optional. It comes from the wiki/<wing>/... path segment
	// when present, otherwise from the "wing" frontmatter key.
	Wing string

	// Hall is optional. It comes from the wiki/<wing>/<hall>/... path
	// segment when present, otherwise from the "hall" frontmatter key.
	Hall string

	// Title is optional and only ever comes from frontmatter.
	Title string
}
|
|
|
|
// Edge is one directed link from a source document to a target slug.
//
// The source line is recorded alongside the link so a caller can locate
// the linking text again after the document has been edited.
type Edge struct {
	// SrcDoc is the linking document's path: forward-slash, relative
	// to brainDir.
	SrcDoc string

	// SrcSlug is the slug of SrcDoc (equal to that document's
	// Entity.Slug).
	SrcSlug string

	// DstSlug is the slug the link points at.
	DstSlug string

	// EdgeType names the kind of relationship; "wikilink" is the only
	// value produced in v1.
	EdgeType string

	// SrcLine is the 1-indexed line of SrcDoc on which the link occurs.
	SrcLine int
}
|
|
|
|
// linkRE recognises wikilinks in either form: [[slug]] or
// [[slug|Display Name]].
//
// Capture group 1 holds the slug (anything up to the first "|" or "]").
// Capture group 2 holds the display text and is empty when the link has
// no "|Display" part.
var linkRE = regexp.MustCompile(`\[\[([^\]|]+)(?:\|([^\]]+))?\]\]`)
|
|
|
|
// Extract parses one markdown document and returns its Entity plus the
|
|
// outgoing wikilink Edges. docPath is forward-slash, relative to
|
|
// brainDir; content is the raw markdown bytes.
|
|
//
|
|
// Returns ok=false when docPath does not yield a usable slug (e.g.
|
|
// non-markdown file slipped through).
|
|
func Extract(docPath string, content []byte) (Entity, []Edge, bool) {
|
|
slug := slugFromPath(docPath)
|
|
if slug == "" {
|
|
return Entity{}, nil, false
|
|
}
|
|
ent := Entity{DocPath: docPath, Slug: slug}
|
|
classifyByPath(&ent, docPath)
|
|
readFrontmatter(&ent, content)
|
|
|
|
edges := extractEdges(docPath, slug, content)
|
|
return ent, edges, true
|
|
}
|
|
|
|
// slugFromPath derives a document's slug: the final path element with
// its ".md" suffix removed. It returns "" when the final element does
// not end in ".md" (and also for a file named exactly ".md", whose slug
// would be empty).
func slugFromPath(docPath string) string {
	name := filepath.Base(docPath)
	slug := strings.TrimSuffix(name, ".md")
	if slug == name {
		// Nothing was trimmed, so this is not a markdown file.
		return ""
	}
	return slug
}
|
|
|
|
// classifyByPath fills Type / Wing / Hall from the path layout when the
|
|
// doc lives under brain/wiki/. Layout: wiki/<wing>/<hall>/<slug>.md
|
|
// or wiki/<bucket>/<slug>.md for the legacy concept/entity/source dirs.
|
|
//
|
|
// Files directly under wiki/ (no subdirectory — e.g. wiki/index.md) used
|
|
// to incorrectly land Type="hall" Wing="index.md" because the path's
|
|
// second segment was the file itself. Now they fall through to Type
|
|
// "knowledge" and leave wing/hall to frontmatter.
|
|
func classifyByPath(e *Entity, docPath string) {
|
|
parts := strings.Split(docPath, "/")
|
|
if len(parts) < 2 || parts[0] != "wiki" {
|
|
e.Type = "knowledge"
|
|
return
|
|
}
|
|
if len(parts) < 3 {
|
|
// wiki/<slug>.md — no subdirectory. Treat as plain knowledge
|
|
// and let frontmatter set wing/hall if they're present.
|
|
e.Type = "knowledge"
|
|
return
|
|
}
|
|
switch parts[1] {
|
|
case "concepts":
|
|
e.Type = "concept"
|
|
case "entities":
|
|
e.Type = "entity"
|
|
case "sources":
|
|
e.Type = "source"
|
|
default:
|
|
// wiki/<wing>/<hall>/<slug>.md
|
|
e.Type = "hall"
|
|
e.Wing = parts[1]
|
|
if len(parts) >= 4 {
|
|
e.Hall = parts[2]
|
|
}
|
|
}
|
|
}
|
|
|
|
// readFrontmatter pulls title/wing/hall from a YAML frontmatter block.
|
|
// Frontmatter is optional; missing fields leave the entity unchanged.
|
|
func readFrontmatter(e *Entity, content []byte) {
|
|
scanner := bufio.NewScanner(bytes.NewReader(content))
|
|
inFM := false
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
if strings.TrimSpace(line) == "---" {
|
|
if !inFM {
|
|
inFM = true
|
|
continue
|
|
}
|
|
return
|
|
}
|
|
if !inFM {
|
|
return
|
|
}
|
|
key, val, ok := strings.Cut(line, ":")
|
|
if !ok {
|
|
continue
|
|
}
|
|
v := strings.Trim(strings.TrimSpace(val), `"'`)
|
|
switch strings.TrimSpace(key) {
|
|
case "title":
|
|
if e.Title == "" {
|
|
e.Title = v
|
|
}
|
|
case "wing":
|
|
if e.Wing == "" {
|
|
e.Wing = v
|
|
}
|
|
case "hall":
|
|
if e.Hall == "" {
|
|
e.Hall = v
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func extractEdges(docPath, srcSlug string, content []byte) []Edge {
|
|
var edges []Edge
|
|
seen := make(map[string]struct{}) // dedupe (dst, line)
|
|
scanner := bufio.NewScanner(bytes.NewReader(content))
|
|
line := 0
|
|
for scanner.Scan() {
|
|
line++
|
|
matches := linkRE.FindAllStringSubmatch(scanner.Text(), -1)
|
|
for _, m := range matches {
|
|
dst := strings.TrimSpace(m[1])
|
|
if dst == "" || dst == srcSlug {
|
|
continue
|
|
}
|
|
key := dst + "|" + itoa(line)
|
|
if _, dup := seen[key]; dup {
|
|
continue
|
|
}
|
|
seen[key] = struct{}{}
|
|
edges = append(edges, Edge{
|
|
SrcDoc: docPath,
|
|
SrcSlug: srcSlug,
|
|
DstSlug: dst,
|
|
EdgeType: "wikilink",
|
|
SrcLine: line,
|
|
})
|
|
}
|
|
}
|
|
return edges
|
|
}
|
|
|
|
// itoa formats n in base 10 without pulling fmt into a hot path.
//
// Zero is special-cased; every other value is rendered right-to-left
// into a fixed stack buffer. The digits are produced from the unsigned
// magnitude of n so that the most negative int, whose negation
// overflows int, is still formatted correctly.
func itoa(n int) string {
	if n == 0 {
		return "0"
	}

	neg := n < 0
	mag := uint(n)
	if neg {
		// Unsigned negation wraps, giving the true magnitude even for
		// the minimum int.
		mag = -mag
	}

	// 20 bytes holds the 19 digits of a 64-bit magnitude plus a sign.
	var buf [20]byte
	i := len(buf)
	for ; mag > 0; mag /= 10 {
		i--
		buf[i] = byte('0' + mag%10)
	}
	if neg {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}
|