hyperguild/brain/eval/score.py

#!/usr/bin/env python3
"""Score brain_query against the qa-2026-05.md eval set.

Reads `q:` / `expected:` pairs, calls brain_query MCP for each, records
top-1 + top-3 hit rate. Run:

    BRAIN_MCP_TOKEN=$(grep '^export BRAIN_MCP_TOKEN=' ~/.llmkeys | cut -d= -f2-) \\
      python3 score.py qa-2026-05.md

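Optionally pass --baseline <name> to label the run; the label is only printed
in the report header line, nothing is written to disk. Pass --k <n> to change
how many results are requested per question (default 5).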
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.request

# URL that brain_query() POSTs its JSON-RPC requests to.
ENDPOINT = "https://brain-mcp.d-ma.be/mcp"


def load_pairs(path):
    """Parse an eval-set file into a list of (question, expected) tuples.

    A line starting with `q:` opens a question; the next line starting with
    `expected:` closes it and yields one pair. Every other line is ignored.
    A `q:` that is followed by another `q:` before any `expected:` is
    replaced by the later one, and an `expected:` with no pending question
    is skipped.

    Args:
        path: path of the eval-set file; it is read as UTF-8.

    Returns:
        List of (question, expected) string tuples in file order, each value
        stripped of surrounding whitespace.
    """
    pairs = []
    pending = None
    # Decode explicitly as UTF-8 so non-ASCII questions/slugs parse the same
    # way regardless of the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("q:"):
                pending = line[2:].strip()
            elif line.startswith("expected:") and pending is not None:
                expected = line[len("expected:"):].strip()
                pairs.append((pending, expected))
                # Consume the question so a stray second `expected:` line
                # cannot produce a duplicate pair.
                pending = None
    return pairs


def brain_query(token, query, k=5):
    """Call the `brain_query` MCP tool and return its list of results.

    Sends one JSON-RPC `tools/call` request to ENDPOINT with a bearer token
    and decodes the reply. The reply body is accepted either as plain JSON or
    as an event stream, in which case the first `data:` line is used.

    Args:
        token: bearer token placed in the Authorization header.
        query: query text passed to the tool.
        k: number of results requested from the tool.

    Returns:
        The `results` list from the tool's JSON payload; an empty list when
        that key is missing or null.

    Raises:
        RuntimeError: the reply carries a JSON-RPC `error`, or the tool
            result is flagged `isError`.
        Exceptions from urllib, json decoding, or a reply with an unexpected
        shape propagate unchanged.
    """
    body = json.dumps({
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {"name": "brain_query", "arguments": {"query": query, "k": k}},
    }).encode()
    req = urllib.request.Request(
        ENDPOINT,
        data=body,
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "Accept": "application/json, text/event-stream",
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=30) as r:
        raw = r.read().decode()

    # If the body is an event stream, the JSON-RPC message is the payload of
    # the first `data:` line; otherwise the whole body is parsed as JSON.
    for line in raw.splitlines():
        if line.startswith("data:"):
            raw = line[5:].strip()
            break

    reply = json.loads(raw)
    if "error" in reply:
        raise RuntimeError(reply["error"])

    result = reply["result"]
    if result.get("isError"):
        # MCP reports tool-level failures inside `result`; the content text is
        # then an error message rather than the JSON payload. Surface it
        # instead of letting json.loads fail with an unrelated parse error.
        texts = [
            item.get("text", "")
            for item in result.get("content") or []
            if isinstance(item, dict)
        ]
        raise RuntimeError("; ".join(t for t in texts if t) or "brain_query tool error")

    text = result["content"][0]["text"]
    # `or []` also covers an explicit `"results": null` in the payload.
    return json.loads(text).get("results") or []


def slug_of(result):
    """Return the slug used to compare a query result against `expected`.

    `title` mirrors the slug in brain_entities for normal entries, so it is
    returned as-is when present and non-empty. Otherwise the slug is the
    basename of `path` with a trailing `.md` removed.

    Args:
        result: one result dict from brain_query.

    Returns:
        The title, or the path-derived slug; an empty string when neither
        field is usable.
    """
    title = result.get("title") or ""
    if title:
        return title
    # A JSON null arrives as None; treat it like a missing path so that
    # os.path.basename is never handed a non-string.
    path = result.get("path") or ""
    return re.sub(r"\.md$", "", os.path.basename(path))


def main():
    """Score every question in the eval set and print a report to stdout.

    Parses the command line (`evalset`, `--baseline`, `--k`), reads the token
    from the BRAIN_MCP_TOKEN environment variable, queries each question and
    prints the top-1 / top-3 hit rates followed by a per-question listing.
    Exits with a message when the token is unset or the eval set has no pairs.
    A question whose query raises is kept in the listing as an error and
    counts as a miss in both rates.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("evalset")
    parser.add_argument("--baseline", default="run")
    parser.add_argument("--k", type=int, default=5)
    opts = parser.parse_args()

    token = os.environ.get("BRAIN_MCP_TOKEN")
    if not token:
        sys.exit("BRAIN_MCP_TOKEN not set")

    pairs = load_pairs(opts.evalset)
    if not pairs:
        sys.exit(f"no pairs in {opts.evalset}")

    total = len(pairs)
    print(f"# {opts.baseline} — {total} questions, k={opts.k}")
    print()

    top1 = 0
    top3 = 0
    rows = []
    for question, expected in pairs:
        try:
            results = brain_query(token, question, k=opts.k)
        except Exception as exc:
            # Keep going: the failure is recorded with a string in the rank
            # slot, which the report below renders with the "!" marker.
            rows.append((question, expected, [], f"ERR {exc}"))
            continue

        slugs = [slug_of(item) for item in results]
        # 1-based position of the expected slug, 0 when it was not returned.
        try:
            rank = slugs.index(expected) + 1
        except ValueError:
            rank = 0

        if rank == 1:
            top1 += 1
        if 1 <= rank <= 3:
            top3 += 1
        rows.append((question, expected, slugs, rank))

    print(f"top-1 hit rate: {top1}/{total} = {100*top1/total:.0f}%")
    print(f"top-3 hit rate: {top3}/{total} = {100*top3/total:.0f}%")
    print()
    print("## per-question detail")
    print()

    rank_markers = {0: "✗", 1: "★", 2: "·", 3: "·"}
    for question, expected, slugs, rank in rows:
        if isinstance(rank, str):
            marker = "!"
        else:
            # Hits below rank 3 have no dedicated symbol and show as "?".
            marker = rank_markers.get(rank, "?")
        print(f"{marker} rank={rank}  expected={expected}")
        print(f"     q: {question}")
        for position, slug in enumerate(slugs[:opts.k], 1):
            suffix = "  <-- expected" if slug == expected else ""
            print(f"     {position}. {slug}{suffix}")
        print()


# Script entry point: run the scorer only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()