hyperguild/ingestion/internal/graph/extract.go

// Package graph extracts entity + edge records from brain markdown
// documents for the brain_entities / brain_edges relational graph.
//
// The extractor is pure: it takes markdown bytes and a document path and
// returns the entity (one per doc) and the wikilink edges (zero or more)
// it found, with source line numbers so the graph store can record
// provenance.
//
// Edge types in v1: only "wikilink" — derived from [[slug]] and
// [[slug|Display]] occurrences in the body. Section-header edges are
// deferred (see infra#62 grill addendum).
package graph

import (
	"bufio"
	"bytes"
	"path/filepath"
	"regexp"
	"strings"
)

// Entity represents one brain document for graph indexing.
//
// Slug is the basename without ".md" — the same identity used by
// wiki canonicalization and the wikilink target syntax.
//
// Type categorises the doc into a coarse bucket so callers can filter
// graph traversals (e.g. "only entity nodes"). When the doc lives
// under brain/wiki/<wing>/<hall>/, Wing and Hall capture the
// taxonomy; otherwise they're empty unless frontmatter supplies them
// (legacy brain/knowledge/ docs).
//
// Precedence as applied by Extract: path classification runs before
// frontmatter is read and frontmatter only fills fields that are still
// empty, so a path-derived Wing/Hall wins over frontmatter. Tier is the
// other way round: frontmatter is read first and the path is only a
// fallback.
type Entity struct {
	DocPath string // forward-slash, relative to brainDir
	Slug    string // basename of DocPath with the ".md" suffix removed
	Type    string // "concept" | "entity" | "source" | "hall" | "knowledge"; derived from the path only
	Wing    string // optional; from the path, else frontmatter `wing:`
	Hall    string // optional; from the path, else frontmatter `hall:`
	Title   string // optional; from frontmatter `title:`
	// DIKW tier — infra#72. Empty until M3 migration writes `tier:`
	// frontmatter to every entry. Path-inferred tier kicks in as a
	// fallback so the column populates immediately on backfill even
	// for entries that haven't had their frontmatter rewritten yet.
	// The frontmatter value is stored as written; it is not checked
	// against the three names listed below.
	Tier  string // "inbox" | "note" | "knowledge"
	Topic string // kebab-slug; the thing the entry is about; from frontmatter `topic:` only
}

// Edge represents a directed relationship between two slugs.
//
// SrcLine is the 1-indexed line in the source document where the link
// was found, so callers can re-find the linking text after an edit.
//
// Edges produced by this package never point a document at itself, and
// a given DstSlug appears at most once per source line; the same target
// linked from several lines yields one Edge per line.
type Edge struct {
	SrcDoc   string // forward-slash, relative to brainDir
	SrcSlug  string // == Entity.Slug for SrcDoc
	DstSlug  string // link target with surrounding whitespace trimmed; not checked against existing docs
	EdgeType string // "wikilink" in v1
	SrcLine  int    // 1-indexed
}

// linkRE matches both [[slug]] and [[slug|Display Name]] wikilinks.
// Group 1 is the slug; group 2 (if present) is the display.
//
// The slug group stops at the first '|' or ']' and may carry
// surrounding whitespace, which extractEdges trims. Group 2 is captured
// but not consumed anywhere in this file.
var linkRE = regexp.MustCompile(`\[\[([^\]|]+)(?:\|([^\]]+))?\]\]`)

// Extract turns a single markdown document into its graph records: the
// Entity describing the document and every outgoing wikilink Edge.
//
// docPath must be forward-slash and relative to brainDir; content is
// the raw markdown bytes.
//
// The third result is false when no slug can be derived from docPath
// (e.g. a non-markdown file slipped through); the Entity is then the
// zero value and the edge slice is nil. On success the edge slice is
// nil when the document contains no links.
func Extract(docPath string, content []byte) (Entity, []Edge, bool) {
	var ent Entity

	slug := slugFromPath(docPath)
	if len(slug) == 0 {
		return ent, nil, false
	}
	ent.DocPath, ent.Slug = docPath, slug

	// Order matters: every step only fills fields that are still empty,
	// so the path layout wins for Wing/Hall, frontmatter wins for Tier,
	// and the path-based tier guess is the last resort.
	classifyByPath(&ent, docPath)
	readFrontmatter(&ent, content)
	inferTierFromPath(&ent, docPath)

	return ent, extractEdges(docPath, slug, content), true
}

// inferTierFromPath sets e.Tier from the first path segment of docPath,
// but only when Tier is still empty (frontmatter did not provide one).
//
// The new layout has a dedicated subtree per tier; pre-migration roots
// (knowledge/, wiki/, raw/, sessions/, clips/) get a best-guess mapping
// so the column populates on backfill before the M3 file moves run.
// Paths under any other root leave Tier empty.
func inferTierFromPath(e *Entity, docPath string) {
	if e.Tier != "" {
		return
	}
	root, rest, _ := strings.Cut(docPath, "/")
	switch root {
	case "inbox", "raw", "sessions", "clips":
		e.Tier = "inbox"
	case "notes":
		e.Tier = "note"
	case "knowledge":
		e.Tier = "knowledge"
	case "wiki":
		// Pre-M3 wiki layout. Most subdirs are I-level:
		//   wiki/sources/  — synth summaries of raw inbox material
		//   wiki/concepts/ — definitions, not lessons
		// The exception is wiki/entities/, which holds anchor facts
		// about concrete things (models, services, people) that the
		// eval expects to surface when queried directly. Those map to
		// K to match the post-M3 layout target (knowledge/facts/).
		sub, _, _ := strings.Cut(rest, "/")
		e.Tier = "note"
		if sub == "entities" {
			e.Tier = "knowledge"
		}
	}
}

// slugFromPath returns the final path element of docPath with its ".md"
// suffix removed, or "" when the element does not end in ".md". A file
// named exactly ".md" also yields "".
func slugFromPath(docPath string) string {
	const ext = ".md"
	name := filepath.Base(docPath)
	slug := strings.TrimSuffix(name, ext)
	if slug == name {
		// Nothing was trimmed, so this is not a markdown file.
		return ""
	}
	return slug
}

// classifyByPath derives Type, and for wing/hall docs also Wing and
// Hall, from where the document sits under wiki/.
//
// Recognised layouts:
//
//	wiki/concepts/...            -> Type "concept"
//	wiki/entities/...            -> Type "entity"
//	wiki/sources/...             -> Type "source"
//	wiki/<wing>/<slug>.md        -> Type "hall", Wing set
//	wiki/<wing>/<hall>/<slug>.md -> Type "hall", Wing and Hall set
//
// Everything else is Type "knowledge". That includes files directly
// under wiki/ (e.g. wiki/index.md), which used to be misread as
// Type="hall" Wing="index.md" because the second path segment was the
// file itself; wing/hall for those are left to frontmatter.
func classifyByPath(e *Entity, docPath string) {
	segs := strings.Split(docPath, "/")

	// Fewer than three segments means there is no subdirectory below
	// wiki/, so there is nothing to classify on.
	if len(segs) < 3 || segs[0] != "wiki" {
		e.Type = "knowledge"
		return
	}

	bucket := segs[1]
	switch bucket {
	case "concepts":
		e.Type = "concept"
		return
	case "entities":
		e.Type = "entity"
		return
	case "sources":
		e.Type = "source"
		return
	}

	// Not a legacy bucket: treat the first directory as the wing and,
	// when a second directory precedes the file, that one as the hall.
	e.Type = "hall"
	e.Wing = bucket
	if len(segs) > 3 {
		e.Hall = segs[2]
	}
}

// readFrontmatter copies title, wing, hall, tier and topic from a
// leading frontmatter block into e. A field is only written when it is
// still empty, so values already set on e (and the first occurrence of
// a repeated key) win.
//
// The block must open with a "---" line as the very first line of
// content and runs until the next "---" line or the end of content.
// Parsing is a plain "key: value" split on the first colon rather than
// real YAML; surrounding whitespace and quote characters are stripped
// from the value. Content without frontmatter leaves e unchanged.
func readFrontmatter(e *Entity, content []byte) {
	sc := bufio.NewScanner(bytes.NewReader(content))

	// Anything other than an opening fence on line one means there is
	// no frontmatter at all.
	if !sc.Scan() || strings.TrimSpace(sc.Text()) != "---" {
		return
	}

	for sc.Scan() {
		text := sc.Text()
		if strings.TrimSpace(text) == "---" {
			return // closing fence
		}
		name, raw, found := strings.Cut(text, ":")
		if !found {
			continue
		}

		var field *string
		switch strings.TrimSpace(name) {
		case "title":
			field = &e.Title
		case "wing":
			field = &e.Wing
		case "hall":
			field = &e.Hall
		case "tier":
			field = &e.Tier
		case "topic":
			field = &e.Topic
		default:
			continue
		}
		if *field == "" {
			*field = strings.Trim(strings.TrimSpace(raw), `"'`)
		}
	}
}

// extractEdges scans content line by line and returns one "wikilink"
// Edge for every distinct (target slug, line) pair matched by linkRE.
//
// docPath and srcSlug are copied verbatim into each Edge. Targets are
// whitespace-trimmed; empty targets and links back to srcSlug itself
// are skipped. Line numbers are 1-indexed. The result is nil when the
// document contains no usable links.
func extractEdges(docPath, srcSlug string, content []byte) []Edge {
	var edges []Edge
	seen := make(map[string]struct{}) // dedupe (dst, line)
	scanner := bufio.NewScanner(bytes.NewReader(content))
	// A default Scanner stops with ErrTooLong at the first line over
	// 64 KiB, which would silently drop that line's links and every
	// link after it. No line can be longer than the document itself, so
	// len(content)+1 lifts the limit without picking an arbitrary cap.
	// With that limit and an in-memory reader the scan cannot fail, so
	// scanner.Err() is not consulted.
	scanner.Buffer(nil, len(content)+1)
	line := 0
	for scanner.Scan() {
		line++
		matches := linkRE.FindAllStringSubmatch(scanner.Text(), -1)
		for _, m := range matches {
			dst := strings.TrimSpace(m[1])
			if dst == "" || dst == srcSlug {
				continue
			}
			// linkRE excludes '|' from the slug group, so it is a safe
			// separator between target and line number.
			key := dst + "|" + itoa(line)
			if _, dup := seen[key]; dup {
				continue
			}
			seen[key] = struct{}{}
			edges = append(edges, Edge{
				SrcDoc:   docPath,
				SrcSlug:  srcSlug,
				DstSlug:  dst,
				EdgeType: "wikilink",
				SrcLine:  line,
			})
		}
	}
	return edges
}

// itoa formats n as a base-10 string without pulling in fmt; it builds
// the dedupe keys in extractEdges, once per wikilink match.
//
// The digits are produced from the unsigned magnitude of n so that the
// most negative int, whose negation does not fit in an int, is still
// formatted correctly instead of yielding a bare "-".
func itoa(n int) string {
	if n == 0 {
		return "0"
	}
	// 20 bytes fit the sign plus the 19 digits of a 64-bit int.
	var buf [20]byte
	i := len(buf)
	neg := n < 0
	u := uint(n)
	if neg {
		// Unsigned negation wraps modulo 2^N, which yields |n| for
		// every negative int including the minimum value.
		u = -u
	}
	for u > 0 {
		i--
		buf[i] = byte('0' + u%10)
		u /= 10
	}
	if neg {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}