fix(graph): route wiki/<flat>.md to Type=knowledge, not Type=hall with filename-as-wing

classifyByPath had a hole: paths like wiki/index.md or wiki/<slug>.md (direct children of wiki/, no subdirectory) hit the default branch and wrote Wing=parts[1] — which IS the filename, not a wing. Symptom in brain_entities: rows like (slug=index, wing=index.md) and (slug=autobe-..., wing=autobe-evaluation-pattern-....md). Fix: when len(parts) < 3 (no subdirectory at all), fall through to Type=knowledge and let frontmatter set wing/hall if present. Add brain/eval/ artifacts at the same time: - qa-2026-05.md — 20 hand-authored Q→expected-slug pairs covering the homelab knowledge corpus across mcp, dex, gitops, postgres, go, models, methodology - score.py — calls brain_query for each pair, scores top-1 + top-3, emits per-question detail. BRAIN_MCP_TOKEN via env. Pre-fix baseline against the live brain: top-1 = 20% (4/20), top-3 = 65% (13/20). Six hard misses where the expected slug doesn't even land in the top-5. Used to gate the phase 2 DIKW redesign (infra#62 follow-up): if phase 1 fixes (this parser fix + 20 backlink authoring on top orphans) lift top-1 by <10 absolute points, structure is the bottleneck and the tier redesign is justified.
2026-05-24 22:33:04 +02:00
parent 72be87b4e7
commit 3084c4173d
5 changed files with 413 additions and 0 deletions
--- a/brain/eval/score.py
+++ b/brain/eval/score.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""Score brain_query against the qa-2026-05.md eval set.
+
+Reads `q:` / `expected:` pairs, calls brain_query MCP for each, records
+top-1 + top-3 hit rate. Run:
+
+    BRAIN_MCP_TOKEN=$(grep '^export BRAIN_MCP_TOKEN=' ~/.llmkeys | cut -d= -f2-) \\
+      python3 score.py qa-2026-05.md
+
+Optionally pass --baseline <name> to label the run in the report header.
+"""
+import argparse
+import json
+import os
+import re
+import sys
+import time
+import urllib.request
+
+# URL of the MCP server; brain_query() POSTs its JSON-RPC requests here.
+ENDPOINT = "https://brain-mcp.d-ma.be/mcp"
+
+
+def load_pairs(path):
+    pairs = []
+    q = None
+    with open(path) as f:
+        for line in f:
+            line = line.rstrip()
+            if line.startswith("q:"):
+                q = line[2:].strip()
+            elif line.startswith("expected:") and q is not None:
+                expected = line[len("expected:"):].strip()
+                pairs.append((q, expected))
+                q = None
+    return pairs
+
+
+def brain_query(token, query, k=5):
+    body = json.dumps({
+        "jsonrpc": "2.0",
+        "id": 1,
+        "method": "tools/call",
+        "params": {"name": "brain_query", "arguments": {"query": query, "k": k}},
+    }).encode()
+    req = urllib.request.Request(
+        ENDPOINT,
+        data=body,
+        headers={
+            "Authorization": f"Bearer {token}",
+            "Content-Type": "application/json",
+            "Accept": "application/json, text/event-stream",
+        },
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=30) as r:
+        raw = r.read().decode()
+    for line in raw.splitlines():
+        if line.startswith("data:"):
+            raw = line[5:].strip()
+            break
+    d = json.loads(raw)
+    if "error" in d:
+        raise RuntimeError(d["error"])
+    text = d["result"]["content"][0]["text"]
+    return json.loads(text).get("results", [])
+
+
+def slug_of(result):
+    # `title` mirrors the slug in brain_entities for normal entries.
+    # Fall back to basename(path) if title is missing.
+    t = result.get("title", "")
+    if t:
+        return t
+    p = result.get("path", "")
+    return re.sub(r"\.md$", "", os.path.basename(p))
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("evalset")
+    ap.add_argument("--baseline", default="run")
+    ap.add_argument("--k", type=int, default=5)
+    args = ap.parse_args()
+
+    token = os.environ.get("BRAIN_MCP_TOKEN")
+    if not token:
+        sys.exit("BRAIN_MCP_TOKEN not set")
+
+    pairs = load_pairs(args.evalset)
+    if not pairs:
+        sys.exit(f"no pairs in {args.evalset}")
+
+    print(f"# {args.baseline} — {len(pairs)} questions, k={args.k}")
+    print()
+    hits1 = 0
+    hits3 = 0
+    detail = []
+    for q, expected in pairs:
+        try:
+            results = brain_query(token, q, k=args.k)
+        except Exception as e:
+            detail.append((q, expected, [], f"ERR {e}"))
+            continue
+        slugs = [slug_of(r) for r in results]
+        rank = slugs.index(expected) + 1 if expected in slugs else 0
+        h1 = 1 if rank == 1 else 0
+        h3 = 1 if 0 < rank <= 3 else 0
+        hits1 += h1
+        hits3 += h3
+        detail.append((q, expected, slugs, rank))
+
+    total = len(pairs)
+    print(f"top-1 hit rate: {hits1}/{total} = {100*hits1/total:.0f}%")
+    print(f"top-3 hit rate: {hits3}/{total} = {100*hits3/total:.0f}%")
+    print()
+    print("## per-question detail")
+    print()
+    for q, expected, slugs, rank in detail:
+        marker = {0: "✗", 1: "★", 2: "·", 3: "·"}.get(rank, "?")
+        if isinstance(rank, str):
+            marker = "!"
+        print(f"{marker} rank={rank}  expected={expected}")
+        print(f"     q: {q}")
+        for i, s in enumerate(slugs[:args.k], 1):
+            mark = "  <-- expected" if s == expected else ""
+            print(f"     {i}. {s}{mark}")
+        print()
+
+
+# Run the scorer only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()