Files
hyperguild/ingestion/internal/search/search.go
Mathias 4f78fecd06
All checks were successful
CI / Lint / Test / Vet (push) Successful in 12s
CI / Mirror to GitHub (push) Successful in 3s
feat(search): M4 tier-weighted BM25 re-rank (infra#72)
The eval set under brain/eval/qa-2026-05.md showed BM25 top-1 at 20%
with 5 of the missing slugs being short focused knowledge entries
that lost to long aggregate docs on raw term-frequency. Tier weighting
addresses that without touching the BM25 algorithm itself.

How

- Result struct gains a Tier field, populated during the file walk
  via extractTier (frontmatter wins, path prefix as fallback —
  mirrors the graph.inferTierFromPath logic so the two callers stay
  in lockstep).
- After the existing sort (and optional hybridMerge), do a final
  stable re-sort by float64(Score) * tierWeight(Tier). Knowledge
  ×1.5, note ×1.0, inbox ×0.3, unknown ×1.0.
- hydrate() (vector-only hits) also fills Tier so re-ranking covers
  the hybrid path.

Test covers the load-bearing case: a long note-tier doc with raw=10
loses to a short knowledge-tier doc with raw=8 after weighting
(8×1.5=12 vs 10×1.0=10).

Measurement gate is in infra#72: re-run brain/eval/score.py against
the live brain after this image lands; close the issue when top-1
hit rate lifts by ≥10 absolute points.
2026-05-25 18:45:20 +02:00

430 lines
12 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ingestion/internal/search/search.go
package search
import (
"bufio"
"context"
"fmt"
"log/slog"
"os"
"path/filepath"
"sort"
"strings"
"github.com/mathiasbq/hyperguild/ingestion/internal/brain"
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
)
// VectorSearcher is the nearest-neighbour lookup used by hybrid search.
//
// Search takes a query embedding and returns at most limit hits, described
// by the original contract as the nearest paths by cosine distance. The
// vectorstore package implements this against pgvector.
//
// NOTE(review): hybridMerge uses each hit's slice position as its rank, so
// implementations are presumably expected to return the closest hit first —
// confirm against the vectorstore implementation.
type VectorSearcher interface {
Search(ctx context.Context, query []float32, limit int) ([]VectorHit, error)
}
// VectorHit is a single path + distance pair from a vector search.
// It is declared here (rather than imported) so that test stubs only need
// types from this package.
//
// NOTE(review): the original comment said this keeps the search package
// free of vectorstore/embed deps, but this file does import vectorstore
// (for ParentPath) — confirm whether that dependency is intended.
type VectorHit struct {
// Path is the vector-store key. hybridMerge treats it as a chunk path
// (e.g. "wiki/foo.md#0001") and collapses it to the parent file.
Path string
// Distance is the distance reported by the store. It is not read in this
// file; ranking uses the hit's position in the returned slice.
Distance float64
}
// Embedder turns a query string into a dense vector. The embed package
// implements this against Ollama's /api/embed.
//
// The returned vector is passed unchanged to VectorSearcher.Search, so the
// two implementations must agree on the embedding model and dimension.
// NOTE(review): nothing in this file validates that — confirm at wiring time.
type Embedder interface {
Embed(ctx context.Context, text string) ([]float32, error)
}
// Result is a single search hit from the brain wiki.
type Result struct {
// Path is the file path relative to the brain directory, using forward
// slashes.
Path string `json:"path"`
// Title comes from the frontmatter "title" key, falling back to the file
// name without its .md suffix.
Title string `json:"title"`
// Excerpt is the beginning of the note body; the callers in this file
// request at most 300 bytes.
Excerpt string `json:"excerpt"`
// Score is the raw ranking score: the summed query-term occurrence count
// for lexical-only searches, or the RRF score scaled by 1e6 when the
// hybrid merge ran. The tier multiplier is applied only while sorting and
// is not folded into this value.
Score int `json:"score"`
// Wing and Hall locate the note in the wiki hierarchy. They are read from
// frontmatter or inferred from the path, and omitted from JSON when empty.
Wing string `json:"wing,omitempty"`
Hall string `json:"hall,omitempty"`
// Tier is the DIKW classification used for retrieval weighting
// (infra#72). Read from frontmatter when present, otherwise
// inferred from the parent directory.
Tier string `json:"tier,omitempty"`
}
// tierWeight returns the score multiplier for a DIKW tier. It is used
// by the final re-rank, just before results are truncated to the limit
// (infra#72).
//
//   - "knowledge" (focused lessons that age well) is boosted to 1.5.
//   - "inbox" (raw captures, sessions, clips) is demoted to 0.3.
//   - "note", the empty string, and any unrecognised tier are neutral
//     (1.0), so their original score is kept.
//
// The match is exact and case-sensitive. The weighting exists because
// short focused entries otherwise lose to long aggregate dump-files
// under raw term-frequency ranking.
func tierWeight(tier string) float64 {
	if tier == "knowledge" {
		return 1.5
	}
	if tier == "inbox" {
		return 0.3
	}
	// "note" and everything else keep their score unchanged.
	return 1.0
}
// QueryOptions configures a search.
//
// When Wing is set, the walk is restricted to brain/wiki/<wing>/.
// When Hall is additionally set, the walk is restricted to
// brain/wiki/<wing>/<hall>/. Without either, the legacy walk over
// brain/knowledge/ and brain/wiki/ is used.
//
// When both Vector and Embedder are non-nil, results are computed
// hybridly: BM25 and vector candidate lists are merged via Reciprocal
// Rank Fusion. With either nil the function falls back to BM25 only,
// keeping behaviour unchanged for callers that have not opted in.
type QueryOptions struct {
// Query is the free-text query. It is lower-cased and split on
// whitespace into terms; a query with no terms yields no results.
Query string
// Limit caps the number of returned results. Values <= 0 are replaced
// by the default of 5.
Limit int
// Wing restricts the search to one wing. It is sanitised before being
// used as a path segment.
Wing string
// Hall restricts the search to one hall inside Wing. It must be a valid
// hall name, and setting it without Wing is an error.
Hall string
// Vector and Embedder enable hybrid retrieval when both are non-nil.
Vector VectorSearcher
Embedder Embedder
}
// Query searches the brain under brainDir using a background context,
// so it cannot be cancelled. Use QueryContext when cancellation or a
// deadline matters, for example with hybrid retrieval.
//
// It returns at most opts.Limit results (5 when Limit is not positive),
// best match first. A query with no terms returns nil, nil.
func Query(brainDir string, opts QueryOptions) ([]Result, error) {
	ctx := context.Background()
	return QueryContext(ctx, brainDir, opts)
}
// QueryContext is the cancellable variant of Query. Hybrid retrieval
// requires a context because both the embedder and the vector store are
// network calls.
//
// Pipeline:
//
//  1. Lexical pass: every .md file under the resolved roots is scored by
//     the total number of (case-insensitive) occurrences of the query
//     terms; files with no occurrence are dropped. This is the score the
//     rest of the file calls "BM25".
//  2. Optional hybrid pass: when both opts.Vector and opts.Embedder are
//     set and the lexical pass found something, the lists are fused with
//     hybridMerge. A failure there is logged and the lexical list is used.
//  3. Tier re-rank (infra#72): results are ordered by Score*tierWeight(Tier)
//     and truncated to opts.Limit (default 5).
//
// Ordering is deterministic: ties on the weighted score are broken by the
// raw Score (higher first) and then by Path (ascending).
//
// It returns nil, nil for a query without terms. It returns an error when
// the wing/hall filter is invalid, when a relative path cannot be computed,
// or when ctx is cancelled or expires during the filesystem walk.
// Unreadable files and directories are logged and skipped.
func QueryContext(ctx context.Context, brainDir string, opts QueryOptions) ([]Result, error) {
	if opts.Limit <= 0 {
		opts.Limit = 5
	}
	terms := strings.Fields(strings.ToLower(opts.Query))
	if len(terms) == 0 {
		return nil, nil
	}
	roots, err := resolveRoots(brainDir, opts.Wing, opts.Hall)
	if err != nil {
		return nil, err
	}

	var results []Result
	for _, dir := range roots {
		// A missing root (e.g. a wing that has no notes yet) is not an error.
		if _, statErr := os.Stat(dir); os.IsNotExist(statErr) {
			continue
		}
		walkErr := filepath.WalkDir(dir, func(path string, d os.DirEntry, err error) error {
			// Stop walking as soon as the caller has given up; a large
			// brain can otherwise keep reading files long after the
			// request was cancelled.
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}
			if err != nil {
				slog.Warn("search: skipping path", "path", path, "err", err)
				return nil
			}
			if d.IsDir() || !strings.HasSuffix(path, ".md") {
				return nil
			}
			content, err := os.ReadFile(path)
			if err != nil {
				slog.Warn("search: skipping unreadable file", "path", path, "err", err)
				return nil
			}
			// Convert once; the string is reused by every extractor below.
			text := string(content)
			lower := strings.ToLower(text)
			score := 0
			for _, term := range terms {
				score += strings.Count(lower, term)
			}
			if score == 0 {
				return nil
			}
			rel, err := filepath.Rel(brainDir, path)
			if err != nil {
				return fmt.Errorf("rel path: %w", err)
			}
			rel = filepath.ToSlash(rel)
			wing, hall := extractWingHall(text, rel)
			results = append(results, Result{
				Path:    rel,
				Title:   extractTitle(text, d.Name()),
				Excerpt: excerpt(text, 300),
				Score:   score,
				Wing:    wing,
				Hall:    hall,
				Tier:    extractTier(text, rel),
			})
			return nil
		})
		if walkErr != nil {
			return nil, walkErr
		}
	}

	// The lexical order feeds rank-based fusion below, so equal scores
	// must not be left in arbitrary order: break ties by path.
	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].Path < results[j].Path
	})

	// Hybrid scoring kicks in only when both the embedder and the
	// vector store are wired and the lexical pass returned candidates.
	if opts.Vector != nil && opts.Embedder != nil && len(results) > 0 {
		merged, err := hybridMerge(ctx, brainDir, opts, results)
		if err != nil {
			slog.Warn("search: hybrid merge failed, falling back to BM25", "err", err)
		} else {
			results = merged
		}
	}

	// Tier-weighted final re-rank (infra#72). Knowledge tier entries
	// boost ×1.5, inbox demote ×0.3, note stays at ×1.0. Applied after
	// hybridMerge so RRF ranking still drives candidate generation;
	// the tier weight only re-orders the merged set.
	weighted := func(r Result) float64 {
		return float64(r.Score) * tierWeight(r.Tier)
	}
	sort.SliceStable(results, func(i, j int) bool {
		wi, wj := weighted(results[i]), weighted(results[j])
		if wi != wj {
			return wi > wj
		}
		// Equal weighted score: prefer the higher raw score (the order
		// the previous stage produced), then fall back to the path so
		// the output never depends on walk or map iteration order.
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].Path < results[j].Path
	})

	if len(results) > opts.Limit {
		results = results[:opts.Limit]
	}
	return results, nil
}
// rrfK is the constant k in the Reciprocal Rank Fusion term
// 1 / (k + rank). 60 is the standard value (Cormack et al. 2009); it is
// kept as a fixed constant here rather than exposed as a tunable.
const rrfK = 60.0
// hybridMerge embeds the query, runs a vector search, and merges its
// candidates with the BM25 list via Reciprocal Rank Fusion. Results
// that came only from the vector side are hydrated by reading the
// note from disk for title/excerpt/wing/hall/tier.
//
//	rrf(d) = sum_r 1 / (k + rank_r(d)) over rankers r ∈ {BM25, vector}
//
// bm25 must already be ordered best-first; its slice index is used as
// the BM25 rank. Each file contributes at most one vector term: the
// rank of its best (first) chunk. Counting every chunk would add extra
// terms beyond the formula above and would favour long, many-chunk
// files over short focused ones.
//
// The returned slice is sorted by fused score (descending), with ties
// broken by path so the order is reproducible. Result.Score holds the
// fused score scaled by 1e6 and truncated to an int.
//
// An error is returned when embedding or the vector search fails; the
// caller is expected to fall back to the BM25 list. Vector hits whose
// file cannot be read are logged and skipped.
func hybridMerge(ctx context.Context, brainDir string, opts QueryOptions, bm25 []Result) ([]Result, error) {
	q, err := opts.Embedder.Embed(ctx, opts.Query)
	if err != nil {
		return nil, fmt.Errorf("embed query: %w", err)
	}

	// Over-fetch so that chunk collapsing and scope filtering still
	// leave enough distinct files to fuse.
	vectorLimit := max(opts.Limit*4, 20)
	hits, err := opts.Vector.Search(ctx, q, vectorLimit)
	if err != nil {
		return nil, fmt.Errorf("vector search: %w", err)
	}

	rrf := make(map[string]float64, len(bm25)+len(hits))
	byPath := make(map[string]Result, len(bm25)+len(hits))
	for rank, r := range bm25 {
		rrf[r.Path] += 1.0 / (rrfK + float64(rank+1))
		byPath[r.Path] = r
	}

	// counted tracks files that already received their vector term, so
	// later (worse-ranked) chunks of the same file are ignored and a
	// failing hydrate is neither retried nor logged once per chunk.
	counted := make(map[string]struct{}, len(hits))
	for rank, h := range hits {
		// Vector store keys are chunk paths ("wiki/foo.md#0001"); collapse
		// back to the parent so the file is scored as a single result row.
		parent := vectorstore.ParentPath(h.Path)
		if opts.Wing != "" && !pathInScope(parent, opts.Wing, opts.Hall) {
			continue
		}
		if _, dup := counted[parent]; dup {
			continue
		}
		counted[parent] = struct{}{}

		rrf[parent] += 1.0 / (rrfK + float64(rank+1))
		if _, seen := byPath[parent]; !seen {
			r, err := hydrate(brainDir, parent)
			if err != nil {
				slog.Warn("search: hydrate failed for vector hit", "path", parent, "err", err)
				continue
			}
			byPath[parent] = r
		}
	}

	merged := make([]Result, 0, len(byPath))
	for p, r := range byPath {
		r.Score = int(rrf[p] * 1e6) // scale to int for stable JSON; relative order is what matters
		merged = append(merged, r)
	}
	// Map iteration order is random and equal fused scores are common
	// (BM25-only rank k ties with vector-only rank k), so break ties on
	// the path to keep the output deterministic.
	sort.Slice(merged, func(i, j int) bool {
		if merged[i].Score != merged[j].Score {
			return merged[i].Score > merged[j].Score
		}
		return merged[i].Path < merged[j].Path
	})
	return merged, nil
}
// pathInScope reports whether relPath (brain-relative, slash-separated)
// lies under wiki/<wing>/, or under wiki/<wing>/<hall>/ when hall is
// non-empty. The wing is sanitised before comparison; the hall is used
// as given.
func pathInScope(relPath, wing, hall string) bool {
	segments := []string{"wiki", brain.Sanitise(wing)}
	if hall != "" {
		segments = append(segments, hall)
	}
	// The trailing slash makes sure "wiki/foo" does not match "wiki/foobar/…".
	scope := strings.Join(segments, "/") + "/"
	return strings.HasPrefix(relPath, scope)
}
// hydrate loads the note at relPath (slash-separated, relative to
// brainDir) and builds a Result with Path, Title, Excerpt, Wing, Hall
// and Tier filled in. Score is left at zero for the caller to set.
// It is used for paths that surface only via vector search.
//
// The read error is returned unchanged when the file cannot be read.
func hydrate(brainDir, relPath string) (Result, error) {
	raw, err := os.ReadFile(filepath.Join(brainDir, filepath.FromSlash(relPath)))
	if err != nil {
		return Result{}, err
	}
	text := string(raw)

	hit := Result{Path: relPath}
	hit.Title = extractTitle(text, filepath.Base(relPath))
	hit.Excerpt = excerpt(text, 300)
	hit.Wing, hit.Hall = extractWingHall(text, relPath)
	hit.Tier = extractTier(text, relPath)
	return hit, nil
}
// resolveRoots maps the wing/hall filter to the directories to walk.
//
//   - no filter:    <brainDir>/knowledge and <brainDir>/wiki
//   - wing only:    <brainDir>/wiki/<wing>
//   - wing + hall:  <brainDir>/wiki/<wing>/<hall>
//
// The hall is checked against the closed vocabulary first, so an invalid
// hall is reported even when no wing is given. A wing that sanitises to
// the empty string, or a hall without a wing, is an error.
func resolveRoots(brainDir, wing, hall string) ([]string, error) {
	hasHall := hall != ""
	if hasHall && !brain.IsValidHall(hall) {
		return nil, fmt.Errorf("invalid hall %q", hall)
	}

	if wing == "" {
		if hasHall {
			return nil, fmt.Errorf("hall filter requires wing")
		}
		legacy := []string{
			filepath.Join(brainDir, "knowledge"),
			filepath.Join(brainDir, "wiki"),
		}
		return legacy, nil
	}

	clean := brain.Sanitise(wing)
	if clean == "" {
		return nil, fmt.Errorf("invalid wing %q", wing)
	}
	if hasHall {
		return []string{filepath.Join(brainDir, "wiki", clean, hall)}, nil
	}
	return []string{filepath.Join(brainDir, "wiki", clean)}, nil
}
// extractTier reads the DIKW tier from frontmatter first, falling back
// to the path prefix mapping (infra#72). Mirrors graph.inferTierFromPath
// so the two callers stay in lockstep — frontmatter is canonical,
// path inference is the migration-window fallback.
//
// content is the full note text and relPath is the brain-relative,
// slash-separated path. Frontmatter is the block between the first two
// lines that consist solely of "---". The value of its first non-empty
// "tier" key is returned with surrounding whitespace and quotes removed.
// A blank "tier:" key counts as unset, so the path fallback still
// applies, which matches how wing and hall treat empty values.
//
// Path fallback, by first path segment:
//
//	inbox, raw, sessions, clips -> "inbox"
//	notes, wiki                 -> "note"
//	knowledge                   -> "knowledge"
//	anything else               -> ""
func extractTier(content, relPath string) string {
	scanner := bufio.NewScanner(strings.NewReader(content))
	inFrontmatter := false
	for scanner.Scan() {
		line := scanner.Text()
		if strings.TrimSpace(line) == "---" {
			if inFrontmatter {
				break // closing delimiter: frontmatter is over
			}
			inFrontmatter = true
			continue
		}
		if !inFrontmatter {
			continue
		}
		key, val, ok := strings.Cut(line, ":")
		if !ok || strings.TrimSpace(key) != "tier" {
			continue
		}
		if tier := strings.Trim(strings.TrimSpace(val), `"'`); tier != "" {
			return tier
		}
	}

	// Only the top-level directory matters; Cut avoids splitting the
	// whole path and always yields a first segment, even for "".
	top, _, _ := strings.Cut(relPath, "/")
	switch top {
	case "inbox", "raw", "sessions", "clips":
		return "inbox"
	case "notes", "wiki":
		return "note"
	case "knowledge":
		return "knowledge"
	}
	return ""
}
// extractWingHall determines the wing and hall of a note.
//
// Frontmatter is consulted first: inside the block delimited by the
// first two "---" lines, the "wing" and "hall" keys are read (quotes and
// surrounding whitespace removed; a later duplicate key overrides an
// earlier one). Whatever is still empty afterwards is inferred from a
// path of the form wiki/<wing>/<hall>/<file>. The path needs at least
// four segments, and the hall segment is accepted only when it is a
// valid hall name.
func extractWingHall(content, relPath string) (wing, hall string) {
	sc := bufio.NewScanner(strings.NewReader(content))
	inside := false
	for sc.Scan() {
		text := sc.Text()
		if strings.TrimSpace(text) == "---" {
			if inside {
				break
			}
			inside = true
			continue
		}
		if !inside {
			continue
		}
		name, value, found := strings.Cut(text, ":")
		if !found {
			continue
		}
		value = strings.Trim(strings.TrimSpace(value), `"'`)
		if name = strings.TrimSpace(name); name == "wing" {
			wing = value
		} else if name == "hall" {
			hall = value
		}
	}

	// Frontmatter supplied everything: nothing to infer.
	if wing != "" && hall != "" {
		return wing, hall
	}

	segs := strings.Split(relPath, "/")
	if len(segs) < 4 || segs[0] != "wiki" {
		return wing, hall
	}
	if wing == "" {
		wing = segs[1]
	}
	if hall == "" && brain.IsValidHall(segs[2]) {
		hall = segs[2]
	}
	return wing, hall
}
// extractTitle returns the display title of a note.
//
// content is the full note text; filename is the base file name used as
// the fallback. The title is the value of the first non-empty "title"
// key inside the frontmatter (the block between the first two lines
// that consist solely of "---"), with surrounding whitespace and quotes
// removed. Only the first colon separates key from value, so titles may
// themselves contain colons.
//
// When there is no frontmatter, no title key, or the key is blank, the
// file name without its ".md" suffix is returned, so a result never has
// an empty title merely because the frontmatter has a blank "title:" line.
func extractTitle(content, filename string) string {
	scanner := bufio.NewScanner(strings.NewReader(content))
	inFrontmatter := false
	for scanner.Scan() {
		line := scanner.Text()
		if strings.TrimSpace(line) == "---" {
			if inFrontmatter {
				break // closing delimiter: stop before the body
			}
			inFrontmatter = true
			continue
		}
		if !inFrontmatter {
			continue
		}
		key, val, ok := strings.Cut(line, ":")
		if !ok || strings.TrimSpace(key) != "title" {
			continue
		}
		if title := strings.Trim(strings.TrimSpace(val), `"'`); title != "" {
			return title
		}
	}
	return strings.TrimSuffix(filename, ".md")
}
// excerpt returns the beginning of a note's body for display.
//
// When content contains at least two "---" separators, everything up to
// and including the second one is treated as frontmatter and dropped,
// and the remainder is trimmed of surrounding whitespace. Otherwise the
// content is used as is.
//
// maxLen is a limit in bytes; negative values are treated as 0. A body
// longer than maxLen is cut and "…" is appended. The cut never splits a
// multi-byte UTF-8 character: an incomplete trailing sequence is
// dropped, so the result stays valid UTF-8 for valid input and may be
// up to three bytes shorter than maxLen before the ellipsis.
func excerpt(content string, maxLen int) string {
	body := content
	if parts := strings.SplitN(content, "---", 3); len(parts) == 3 {
		body = strings.TrimSpace(parts[2])
	}
	if maxLen < 0 {
		maxLen = 0 // a negative limit would make the slice below panic
	}
	if len(body) <= maxLen {
		return body
	}
	// body[:maxLen] is a byte slice and may end in the middle of a rune;
	// ToValidUTF8 with an empty replacement removes that dangling tail.
	return strings.ToValidUTF8(body[:maxLen], "") + "…"
}