hyperguild/ingestion/internal/reranker/reranker.go

// Package reranker scores (query, document) pairs against a cross-encoder
// served by an Ollama-compatible backend.
//
// Wire format is Ollama's `/api/generate`. The model is prompted with the
// Qwen3-Reranker yes/no template — the canonical interface the model
// itself was trained against — and the first token of the response is
// treated as a binary relevance vote: "yes" → 1.0, anything else → 0.0.
// Ties are expected to be broken by the caller's primary retrieval score
// (e.g. BM25), so the binary signal is a filter rather than a ranking
// substitute.
package reranker

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// Client posts rerank requests to an Ollama-compatible endpoint.
//
// Values are normally obtained from New, which normalizes URL and installs
// an HTTP client with a timeout.
type Client struct {
	// URL is the backend base address. New strips trailing slashes from it;
	// the "/api/generate" path is appended when a request is built.
	URL   string
	// Model is sent verbatim as the "model" field of every request.
	Model string
	// HTTP is the client used to send requests. New sets one with a
	// 30-second timeout.
	HTTP  *http.Client
}

// New builds a Client for the backend at url using the given model name.
//
// An empty url yields a nil *Client, which lets callers interpret an unset
// BRAIN_RERANKER_URL as "reranking disabled" with one nil comparison.
// Trailing slashes are removed from url, and the embedded HTTP client is
// given a 30-second timeout.
func New(url, model string) *Client {
	if len(url) == 0 {
		return nil
	}

	client := new(Client)
	client.URL = strings.TrimRight(url, "/")
	client.Model = model
	client.HTTP = &http.Client{Timeout: 30 * time.Second}
	return client
}

// Score rates every document in docs against query and returns the scores
// in the same order as the input, each in [0, 1].
//
// Pairs are scored one at a time, in order, because Qwen3-Reranker is a
// cross-encoder that expects a separate call per (query, doc) pair. The
// first failure aborts the run: the result is nil and the error names the
// index of the offending document.
func (c *Client) Score(ctx context.Context, query string, docs []string) ([]float64, error) {
	scores := make([]float64, 0, len(docs))
	for idx := range docs {
		score, err := c.scoreOne(ctx, query, docs[idx])
		if err != nil {
			return nil, fmt.Errorf("rerank doc %d: %w", idx, err)
		}
		scores = append(scores, score)
	}
	return scores, nil
}

// scoreOne scores a single (query, doc) pair with one non-streaming POST
// to `/api/generate` and maps the model's reply through parseYesNo.
//
// It returns an error when the request cannot be encoded, built, or sent,
// when the backend answers with a non-2xx status (the message carries a
// bounded prefix of the response body), or when the response body is not
// valid JSON. The score is 0 whenever the error is non-nil.
func (c *Client) scoreOne(ctx context.Context, query, doc string) (float64, error) {
	// Upper bound on how much of a failing response is copied into the
	// returned error, so a misbehaving backend cannot inflate it.
	const maxErrBody = 4 << 10

	// NOTE(review): the prompt below already carries the chat-template
	// markers. Ollama documents a `raw` request option that skips its own
	// prompt templating — confirm whether it should be set for the deployed
	// model.
	reqBody, err := json.Marshal(map[string]any{
		"model":  c.Model,
		"prompt": buildPrompt(query, doc),
		"stream": false,
		"options": map[string]any{
			// Only the start of the reply is inspected, so a few tokens suffice.
			"num_predict": 4,
			"temperature": 0,
		},
	})
	if err != nil {
		return 0, fmt.Errorf("encode request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost,
		c.URL+"/api/generate", bytes.NewReader(reqBody))
	if err != nil {
		return 0, fmt.Errorf("build request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	// A Client built as a struct literal may lack an HTTP client; fall back
	// to the default one instead of dereferencing nil.
	httpClient := c.HTTP
	if httpClient == nil {
		httpClient = http.DefaultClient
	}

	// Errors from Do already name the method and URL, so they are returned as is.
	resp, err := httpClient.Do(req)
	if err != nil {
		return 0, err
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode/100 != 2 {
		// Best effort: the body is diagnostic detail only, so a read failure
		// just shortens the message.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		return 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(body))
	}

	var out struct {
		Response string `json:"response"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return 0, fmt.Errorf("decode response: %w", err)
	}
	return parseYesNo(out.Response), nil
}

// buildPrompt renders the Qwen3-Reranker chat template for one
// (query, doc) pair: a fixed system turn, a user turn carrying the
// instruction, query and document, and an assistant turn pre-filled with
// an empty think block. The wording is reproduced verbatim because the
// model was trained on exactly this text.
func buildPrompt(query, doc string) string {
	const (
		systemTurn    = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
		userHeader    = "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n"
		turnEnd       = "<|im_end|>\n"
		assistantTurn = "<|im_start|>assistant\n<think>\n\n</think>\n\n"
	)

	queryLine := "<Query>: " + query + "\n"
	documentLine := "<Document>: " + doc
	userTurn := userHeader + queryLine + documentLine + turnEnd

	return systemTurn + userTurn + assistantTurn
}

// parseYesNo maps a raw model reply to a binary relevance score: 1.0 when
// the first meaningful text starts with "yes" (case-insensitive), 0.0
// otherwise.
//
// Before the comparison it discards everything up to and including the
// first `</think>` marker, if present, trims surrounding whitespace, and
// then skips any leading ASCII punctuation or whitespace, so replies such
// as `"Yes"` or `**yes**` still count as a yes vote. An empty or
// unrecognised reply scores 0.0.
func parseYesNo(s string) float64 {
	// The model may emit a `<think>…</think>` block even with empty
	// thinking; only the text after it is the answer.
	if _, after, found := strings.Cut(s, "</think>"); found {
		s = after
	}
	s = strings.TrimSpace(s)

	// Skip quotes, markdown emphasis, bullets and similar decoration. Only
	// ASCII runes that are not letters or digits are dropped; anything else
	// is treated as the start of the answer.
	s = strings.TrimLeftFunc(s, func(r rune) bool {
		switch {
		case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return false
		default:
			return r < 0x80
		}
	})

	if strings.HasPrefix(strings.ToLower(s), "yes") {
		return 1.0
	}
	return 0.0
}