#!/usr/bin/env python3
"""Score brain_query against the qa-2026-05.md eval set.

Reads `q:` / `expected:` pairs, calls brain_query MCP for each,
records top-1 + top-3 hit rate.

Run:
    BRAIN_MCP_TOKEN=$(grep '^export BRAIN_MCP_TOKEN=' ~/.llmkeys | cut -d= -f2-) \\
      python3 score.py qa-2026-05.md

Optionally pass --baseline LABEL to label the run in the report header.
The report is written to stdout; redirect it to a file to keep it.
"""

import argparse
import json
import os
import re
import sys
import time
import urllib.request

ENDPOINT = "https://brain-mcp.d-ma.be/mcp"


def load_pairs(path: str) -> list:
    """Parse an eval-set file into a list of ``(question, expected)`` tuples.

    A line starting with ``q:`` opens a question; the next line starting
    with ``expected:`` closes it. An ``expected:`` line with no pending
    question is ignored, and a second ``q:`` before any ``expected:``
    replaces the pending question. All other lines are skipped.
    """
    pairs = []
    pending_q = None
    # Explicit encoding: the locale default is not UTF-8 on every platform.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("q:"):
                pending_q = line[len("q:"):].strip()
            elif line.startswith("expected:") and pending_q is not None:
                expected = line[len("expected:"):].strip()
                pairs.append((pending_q, expected))
                pending_q = None
    return pairs


def brain_query(token: str, query: str, k: int = 5) -> list:
    """Call the ``brain_query`` tool over JSON-RPC and return its results.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        query: Query string passed to the tool.
        k: Number of results requested from the tool.

    Returns:
        The ``results`` list from the JSON document in the first content
        item of the tool response (empty list if the key is absent).

    Raises:
        RuntimeError: The response carries a JSON-RPC ``error`` member, or
            the tool result is flagged with ``isError``.
        Exception: Network, HTTP and JSON decoding errors propagate as-is.
    """
    body = json.dumps({
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {
            "name": "brain_query",
            "arguments": {"query": query, "k": k},
        },
    }).encode()
    req = urllib.request.Request(
        ENDPOINT,
        data=body,
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream",
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=30) as r:
        raw = r.read().decode()

    # The server may answer as an event stream; in that case the JSON
    # payload is whatever follows the first "data:" line.
    for line in raw.splitlines():
        if line.startswith("data:"):
            raw = line[5:].strip()
            break

    payload = json.loads(raw)
    if "error" in payload:
        raise RuntimeError(payload["error"])

    result = payload["result"]
    text = result["content"][0]["text"]
    # MCP tool results flag tool-level failures with `isError`; surface the
    # message instead of scoring the question as a plain miss.
    if result.get("isError"):
        raise RuntimeError(text)
    return json.loads(text).get("results", [])


def slug_of(result: dict) -> str:
    """Return the slug used to compare a result against ``expected``.

    `title` mirrors the slug in brain_entities for normal entries.
    Falls back to basename(path) without a trailing ``.md`` when the title
    is missing or empty; returns ``""`` when neither is usable.
    """
    title = result.get("title") or ""
    if title:
        return title
    # `or ""` guards against an explicit None, which basename() rejects.
    path = result.get("path") or ""
    return re.sub(r"\.md$", "", os.path.basename(path))


def main():
    """Run every eval question through brain_query and print a hit report."""
    ap = argparse.ArgumentParser()
    ap.add_argument("evalset", help="eval-set file with q:/expected: pairs")
    ap.add_argument("--baseline", default="run",
                    help="label printed in the report header")
    ap.add_argument("--k", type=int, default=5,
                    help="number of results requested per query")
    args = ap.parse_args()

    token = os.environ.get("BRAIN_MCP_TOKEN")
    if not token:
        sys.exit("BRAIN_MCP_TOKEN not set")

    pairs = load_pairs(args.evalset)
    if not pairs:
        sys.exit(f"no pairs in {args.evalset}")

    print(f"# {args.baseline} — {len(pairs)} questions, k={args.k}")
    print()

    hits1 = 0
    hits3 = 0
    # Rows are (question, expected, slugs, rank). rank is 0 for a miss, the
    # 1-based position for a hit, or an "ERR ..." string for a failed query.
    detail = []
    for q, expected in pairs:
        try:
            results = brain_query(token, q, k=args.k)
        except Exception as e:
            # Best effort: one failed query must not abort the run. It
            # stays in the denominator and so counts as a miss.
            detail.append((q, expected, [], f"ERR {e}"))
            continue
        slugs = [slug_of(r) for r in results]
        rank = slugs.index(expected) + 1 if expected in slugs else 0
        if rank == 1:
            hits1 += 1
        if 0 < rank <= 3:
            hits3 += 1
        detail.append((q, expected, slugs, rank))

    total = len(pairs)
    print(f"top-1 hit rate: {hits1}/{total} = {100*hits1/total:.0f}%")
    print(f"top-3 hit rate: {hits3}/{total} = {100*hits3/total:.0f}%")
    print()
    print("## per-question detail")
    print()

    # Hits below rank 3 have no entry here and fall back to "?".
    rank_markers = {0: "✗", 1: "★", 2: "·", 3: "·"}
    for q, expected, slugs, rank in detail:
        if isinstance(rank, str):
            marker = "!"
        else:
            marker = rank_markers.get(rank, "?")
        print(f"{marker} rank={rank} expected={expected}")
        print(f" q: {q}")
        for i, s in enumerate(slugs[:args.k], 1):
            mark = " <-- expected" if s == expected else ""
            print(f" {i}. {s}{mark}")
        print()


if __name__ == "__main__":
    main()