diff --git a/brain/eval/baseline-pre-fix.txt b/brain/eval/baseline-pre-fix.txt new file mode 100644 index 0000000..a9a96bb --- /dev/null +++ b/brain/eval/baseline-pre-fix.txt @@ -0,0 +1,167 @@ +# baseline-pre-fix — 20 questions, k=5 + +top-1 hit rate: 4/20 = 20% +top-3 hit rate: 13/20 = 65% + +## per-question detail + +· rank=3 expected=dex-in-memory-storage-wipes-oauth-tokens-on-every-pod-restart + q: how do I stop dex from logging users out on every pod restart? + 1. homelab-network-perimeter-model + 2. 2026-05-12-koala-machine-state + 3. dex-in-memory-storage-wipes-oauth-tokens-on-every-pod-restart <-- expected + 4. infra-litellm-absorption-2026-05-16 + 5. Financial Sentiment Analysis on Stock Market Headlines With FinBERT & HuggingFace + +★ rank=1 expected=postgres-least-privilege-migration-tenant-grant-bypass-2026-05 + q: my postgres-exporter broke after revoking PUBLIC CONNECT — why? + 1. postgres-least-privilege-migration-tenant-grant-bypass-2026-05 <-- expected + 2. infra-litellm-absorption-2026-05-16 + 3. brain-mcp-activation-runbook + 4. extension-version-lags-platform-major-upgrade + 5. ntfy-deny-all-rollout-ordering-keep-alert-pipeline-live-during-auth-flip + +★ rank=1 expected=homelab-network-perimeter-model + q: when is a NodePort acceptable vs needing a public ingress with bearer gate? + 1. homelab-network-perimeter-model <-- expected + 2. qwen3-thinking-model-empty-content-trap + 3. mcpclient-empty-token-silent-401-envfrom-missing-key + 4. 2026-05-12-koala-machine-state + 5. koala-llama-swap-native-tool-calls-survey-2026-05 + +· rank=3 expected=exit-255-unknown-reason-not-oom + q: what does container exit code 255 with reason Unknown mean? + 1. qwen3-thinking-model-empty-content-trap + 2. infra-litellm-absorption-2026-05-16 + 3. exit-255-unknown-reason-not-oom <-- expected + 4. mcpclient-empty-token-silent-401-envfrom-missing-key + 5. 
koala-llama-swap-native-tool-calls-survey-2026-05 + +· rank=3 expected=gitea-push-mirror-cannot-create-remote-repo-needs-pre-existing-github-repo + q: can gitea push-mirror create the github repo automatically? + 1. infra-litellm-absorption-2026-05-16 + 2. Autoresearch + 3. gitea-push-mirror-cannot-create-remote-repo-needs-pre-existing-github-repo <-- expected + 4. adr-new-project-gitea-first-github-mirror + 5. adr-github-as-primary-remote + +✗ rank=0 expected=flux-healthcheck-stale-on-resource-removal + q: a flux kustomization is stuck after I removed a resource — why? + 1. qwen3-thinking-model-empty-content-trap + 2. 2026-05-12-koala-machine-state + 3. homelab-architecture-principles-2026-05 + 4. gitea-mcp: full stack shipped end-to-end (2026-05-05) + 5. k8s-configmap-mount-no-reload-needs-pod-restart + +· rank=2 expected=go-bytes-buffer-bytes-reset-aliasing-trap + q: the bytes buffer aliasing trap with Reset in a loop — what's the bug? + 1. Financial Sentiment Analysis on Stock Market Headlines With FinBERT & HuggingFace + 2. go-bytes-buffer-bytes-reset-aliasing-trap <-- expected + 3. homelab-security-chains-not-bugs + 4. training-on-rtx-5070-pretraining-vs-finetuning + 5. Hash Encoding + +★ rank=1 expected=homelab-architecture-principles-2026-05 + q: what are the homelab architecture principles from may 2026? + 1. homelab-architecture-principles-2026-05 <-- expected + 2. homelab-network-perimeter-model + 3. Claude Managed Agents — architecture notes relevant to homelab agent platform + 4. homelab-core-glossary + 5. 2026-05-12-koala-machine-state + +✗ rank=0 expected=2026-05-04-sops-age-key-from-flux-cluster + q: where does the sops age private key live in the cluster? + 1. 2026-05-12-koala-machine-state + 2. homelab-network-perimeter-model + 3. postgres-least-privilege-migration-tenant-grant-bypass-2026-05 + 4. brain-mcp-activation-runbook + 5. 
dex-in-memory-storage-wipes-oauth-tokens-on-every-pod-restart + +✗ rank=0 expected=grafana-dashboards-as-code-not-ui-state + q: why do my grafana dashboards disappear after a pod restart? + 1. infra-litellm-absorption-2026-05-16 + 2. 2026-05-12-koala-machine-state + 3. Financial Sentiment Analysis on Stock Market Headlines With FinBERT & HuggingFace + 4. brain-mcp-activation-runbook + 5. dex-in-memory-storage-wipes-oauth-tokens-on-every-pod-restart + +· rank=2 expected=double-diamond-methodology + q: what is the double diamond methodology? + 1. Harnessing the Power of Hash Encoding for Categorical Data in Data Science + 2. double-diamond-methodology <-- expected + 3. unified-methodology-diamond-futures-autoresearch + 4. futures-thinking-extended-double-diamond + 5. insight-exploration-as-diamond-1 + +· rank=3 expected=2026-05-04-mcp-transport-version-claude-ai-strict + q: my MCP server works from claude code but fails on claude.ai — what's different? + 1. qwen3-thinking-model-empty-content-trap + 2. mcp-resource-url-empty-breaks-claude-ai-discovery-silently + 3. 2026-05-04-mcp-transport-version-claude-ai-strict <-- expected + 4. 2026-05-04-claude-ai-custom-mcp-connectors + 5. finding-github-mcp-claudeai-vs-claudecode + +· rank=2 expected=homelab-security-chains-not-bugs + q: how should I rate security findings — isolated bugs or exploit chains? + 1. homelab-network-perimeter-model + 2. homelab-security-chains-not-bugs <-- expected + 3. Financial Sentiment Analysis on Stock Market Headlines With FinBERT & HuggingFace + 4. policy-audit-mode-blocks-nothing + 5. homelab-document-accepted-risk-to-break-audit-cycle + +· rank=2 expected=2026-05-03-canonical-vs-derived-context-flow + q: how should canonical context files relate to derived adapter files? + 1. qwen3-thinking-model-empty-content-trap + 2. 2026-05-03-canonical-vs-derived-context-flow <-- expected + 3. 2026-05-12-koala-machine-state + 4. 2026-05-04-claude-ai-custom-mcp-connectors + 5. 
koala-llama-swap-native-tool-calls-survey-2026-05 + +· rank=2 expected=homelab-core-glossary + q: what is the homelab core vocabulary glossary? + 1. homelab-architecture-principles-2026-05 + 2. homelab-core-glossary <-- expected + 3. Claude Managed Agents — architecture notes relevant to homelab agent platform + 4. 2026-05-12-koala-machine-state + 5. Autoresearch + +★ rank=1 expected=koala-llama-swap-native-tool-calls-survey-2026-05 + q: which models on koala llama-swap actually emit native tool_calls correctly? + 1. koala-llama-swap-native-tool-calls-survey-2026-05 <-- expected + 2. 2026-05-12-koala-machine-state + 3. infra-litellm-absorption-2026-05-16 + 4. training-on-rtx-5070-pretraining-vs-finetuning + 5. qwen3-thinking-model-empty-content-trap + +✗ rank=0 expected=qwen35-9b-fast + q: what is qwen35-9b-fast and what's it used for? + 1. koala-llama-swap-native-tool-calls-survey-2026-05 + 2. qwen3-thinking-model-empty-content-trap + 3. Qwen35-9b-fast + 4. infra-litellm-absorption-2026-05-16 + 5. 2026-05-12-koala-machine-state + +✗ rank=0 expected=go-defer-errcheck-body-close + q: in go, how do I prevent defer body close from silently dropping errors? + 1. infra-litellm-absorption-2026-05-16 + 2. homelab-network-perimeter-model + 3. go-bytes-buffer-bytes-reset-aliasing-trap + 4. mcpclient-empty-token-silent-401-envfrom-missing-key + 5. brain-mcp-activation-runbook + +✗ rank=0 expected=hyperguild-level3-pipeline-rewrite + q: what was the level 3 rewrite of hyperguild's ingestion pipeline? + 1. 2026-05-12-koala-machine-state + 2. homelab-core-glossary + 3. brain-mcp-activation-runbook + 4. koala-llama-swap-native-tool-calls-survey-2026-05 + 5. infra-litellm-absorption-2026-05-16 + +? rank=4 expected=adr-new-project-gitea-first-github-mirror + q: what's the new-project ADR — is it gitea-first or github-first? + 1. gitea-push-mirror-cannot-create-remote-repo-needs-pre-existing-github-repo + 2. gitea-mcp: full stack shipped end-to-end (2026-05-05) + 3. 
mcp-tool-design-get-needs-list-partner + 4. adr-new-project-gitea-first-github-mirror <-- expected + 5. 2026-05-04-gitea-mcp-build-session + diff --git a/brain/eval/qa-2026-05.md b/brain/eval/qa-2026-05.md new file mode 100644 index 0000000..f0c99d5 --- /dev/null +++ b/brain/eval/qa-2026-05.md @@ -0,0 +1,76 @@ +# Brain retrieval eval set — 2026-05-24 + +20 hand-authored Q→expected-top-1-slug pairs. Used by `score.py` to +measure brain_query top-1 + top-3 hit rate against the live brain. + +Authoring rules: +- Each question maps to **one** clear-best entry. Avoid ambiguous + questions where multiple slugs could be the right answer. +- Questions are phrased the way a future-me would actually ask, not + the way the entry's title reads. Some lexical distance is the point. +- `expected` is the slug as stored in `brain_entities.slug`. Update + if the slug renames. + +## Pairs + +``` +q: how do I stop dex from logging users out on every pod restart? +expected: dex-in-memory-storage-wipes-oauth-tokens-on-every-pod-restart + +q: my postgres-exporter broke after revoking PUBLIC CONNECT — why? +expected: postgres-least-privilege-migration-tenant-grant-bypass-2026-05 + +q: when is a NodePort acceptable vs needing a public ingress with bearer gate? +expected: homelab-network-perimeter-model + +q: what does container exit code 255 with reason Unknown mean? +expected: exit-255-unknown-reason-not-oom + +q: can gitea push-mirror create the github repo automatically? +expected: gitea-push-mirror-cannot-create-remote-repo-needs-pre-existing-github-repo + +q: a flux kustomization is stuck after I removed a resource — why? +expected: flux-healthcheck-stale-on-resource-removal + +q: the bytes buffer aliasing trap with Reset in a loop — what's the bug? +expected: go-bytes-buffer-bytes-reset-aliasing-trap + +q: what are the homelab architecture principles from may 2026? +expected: homelab-architecture-principles-2026-05 + +q: where does the sops age private key live in the cluster? 
+expected: 2026-05-04-sops-age-key-from-flux-cluster + +q: why do my grafana dashboards disappear after a pod restart? +expected: grafana-dashboards-as-code-not-ui-state + +q: what is the double diamond methodology? +expected: double-diamond-methodology + +q: my MCP server works from claude code but fails on claude.ai — what's different? +expected: 2026-05-04-mcp-transport-version-claude-ai-strict + +q: how should I rate security findings — isolated bugs or exploit chains? +expected: homelab-security-chains-not-bugs + +q: how should canonical context files relate to derived adapter files? +expected: 2026-05-03-canonical-vs-derived-context-flow + +q: what is the homelab core vocabulary glossary? +expected: homelab-core-glossary + +q: which models on koala llama-swap actually emit native tool_calls correctly? +expected: koala-llama-swap-native-tool-calls-survey-2026-05 + +q: what is qwen35-9b-fast and what's it used for? +expected: qwen35-9b-fast + +q: in go, how do I prevent defer body close from silently dropping errors? +expected: go-defer-errcheck-body-close + +q: what was the level 3 rewrite of hyperguild's ingestion pipeline? +expected: hyperguild-level3-pipeline-rewrite + +q: what's the new-project ADR — is it gitea-first or github-first? +expected: adr-new-project-gitea-first-github-mirror +``` diff --git a/brain/eval/score.py b/brain/eval/score.py new file mode 100644 index 0000000..25f73c2 --- /dev/null +++ b/brain/eval/score.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +"""Score brain_query against the qa-2026-05.md eval set. + +Reads `q:` / `expected:` pairs, calls brain_query MCP for each, records +top-1 + top-3 hit rate. Run: + + BRAIN_MCP_TOKEN=$(grep '^export BRAIN_MCP_TOKEN=' ~/.llmkeys | cut -d= -f2-) \\ + python3 score.py qa-2026-05.md + +Optionally pass --baseline LABEL to label the run in the report header (redirect stdout to save it). 
+""" +import argparse +import json +import os +import re +import sys +import time +import urllib.request + +ENDPOINT = "https://brain-mcp.d-ma.be/mcp" + + +def load_pairs(path): + pairs = [] + q = None + with open(path) as f: + for line in f: + line = line.rstrip() + if line.startswith("q:"): + q = line[2:].strip() + elif line.startswith("expected:") and q is not None: + expected = line[len("expected:"):].strip() + pairs.append((q, expected)) + q = None + return pairs + + +def brain_query(token, query, k=5): + body = json.dumps({ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": {"name": "brain_query", "arguments": {"query": query, "k": k}}, + }).encode() + req = urllib.request.Request( + ENDPOINT, + data=body, + headers={ + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + "Accept": "application/json, text/event-stream", + }, + method="POST", + ) + with urllib.request.urlopen(req, timeout=30) as r: + raw = r.read().decode() + for line in raw.splitlines(): + if line.startswith("data:"): + raw = line[5:].strip() + break + d = json.loads(raw) + if "error" in d: + raise RuntimeError(d["error"]) + text = d["result"]["content"][0]["text"] + return json.loads(text).get("results", []) + + +def slug_of(result): + # `title` mirrors the slug in brain_entities for normal entries. + # Fall back to basename(path) if title is missing. 
+ t = result.get("title", "") + if t: + return t + p = result.get("path", "") + return re.sub(r"\.md$", "", os.path.basename(p)) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("evalset") + ap.add_argument("--baseline", default="run") + ap.add_argument("--k", type=int, default=5) + args = ap.parse_args() + + token = os.environ.get("BRAIN_MCP_TOKEN") + if not token: + sys.exit("BRAIN_MCP_TOKEN not set") + + pairs = load_pairs(args.evalset) + if not pairs: + sys.exit(f"no pairs in {args.evalset}") + + print(f"# {args.baseline} — {len(pairs)} questions, k={args.k}") + print() + hits1 = 0 + hits3 = 0 + detail = [] + for q, expected in pairs: + try: + results = brain_query(token, q, k=args.k) + except Exception as e: + detail.append((q, expected, [], f"ERR {e}")) + continue + slugs = [slug_of(r) for r in results] + rank = slugs.index(expected) + 1 if expected in slugs else 0 + h1 = 1 if rank == 1 else 0 + h3 = 1 if 0 < rank <= 3 else 0 + hits1 += h1 + hits3 += h3 + detail.append((q, expected, slugs, rank)) + + total = len(pairs) + print(f"top-1 hit rate: {hits1}/{total} = {100*hits1/total:.0f}%") + print(f"top-3 hit rate: {hits3}/{total} = {100*hits3/total:.0f}%") + print() + print("## per-question detail") + print() + for q, expected, slugs, rank in detail: + marker = {0: "✗", 1: "★", 2: "·", 3: "·"}.get(rank, "?") + if isinstance(rank, str): + marker = "!" + print(f"{marker} rank={rank} expected={expected}") + print(f" q: {q}") + for i, s in enumerate(slugs[:args.k], 1): + mark = " <-- expected" if s == expected else "" + print(f" {i}. 
{s}{mark}") + print() + + +if __name__ == "__main__": + main() diff --git a/ingestion/internal/graph/extract.go b/ingestion/internal/graph/extract.go index 57550cd..0281665 100644 --- a/ingestion/internal/graph/extract.go +++ b/ingestion/internal/graph/extract.go @@ -83,12 +83,23 @@ func slugFromPath(docPath string) string { // classifyByPath fills Type / Wing / Hall from the path layout when the // doc lives under brain/wiki/. Layout: wiki/<wing>/<hall>/<slug>.md // or wiki/<type>/<slug>.md for the legacy concept/entity/source dirs. +// +// Files directly under wiki/ (no subdirectory — e.g. wiki/index.md) used +// to incorrectly land Type="hall" Wing="index.md" because the path's +// second segment was the file itself. Now they fall through to Type +// "knowledge" and leave wing/hall to frontmatter. func classifyByPath(e *Entity, docPath string) { parts := strings.Split(docPath, "/") if len(parts) < 2 || parts[0] != "wiki" { e.Type = "knowledge" return } + if len(parts) < 3 { + // wiki/<slug>.md — no subdirectory. Treat as plain knowledge + // and let frontmatter set wing/hall if they're present. + e.Type = "knowledge" + return + } switch parts[1] { case "concepts": e.Type = "concept" diff --git a/ingestion/internal/graph/extract_test.go b/ingestion/internal/graph/extract_test.go index c785837..53faa0a 100644 --- a/ingestion/internal/graph/extract_test.go +++ b/ingestion/internal/graph/extract_test.go @@ -104,3 +104,31 @@ func TestExtract_LineNumbersAre1Indexed(t *testing.T) { require.Len(t, edges, 1) assert.Equal(t, 2, edges[0].SrcLine) } + +// Files directly under wiki/ (no subdirectory) used to land +// Type="hall" Wing="<slug>.md" because the path's second segment +// was the file itself. The fix routes them to Type="knowledge" with +// empty Wing/Hall and lets frontmatter set them if present. 
+func TestExtract_WikiRootFileIsKnowledgeNotHall(t *testing.T) { + content := []byte("# Index\n\n- [[foo]]\n") + ent, _, ok := Extract("wiki/index.md", content) + require.True(t, ok) + assert.Equal(t, "index", ent.Slug) + assert.Equal(t, "knowledge", ent.Type) + assert.Empty(t, ent.Wing) + assert.Empty(t, ent.Hall) +} + +func TestExtract_WikiRootFileWithFrontmatterWingHall(t *testing.T) { + content := []byte(`--- +wing: homelab +hall: facts +--- +# Some root note +`) + ent, _, ok := Extract("wiki/some-note.md", content) + require.True(t, ok) + assert.Equal(t, "knowledge", ent.Type) + assert.Equal(t, "homelab", ent.Wing) + assert.Equal(t, "facts", ent.Hall) +}