// Package reranker scores (query, document) pairs against a cross-encoder
// served by an Ollama-compatible backend.
//
// Wire format is Ollama's `/api/generate`. The model is prompted with the
// Qwen3-Reranker yes/no template — the canonical interface the model
// itself was trained against — and the first token of the response is
// treated as a binary relevance vote: "yes" → 1.0, anything else → 0.0.
// Ties are expected to be broken by the caller's primary retrieval score
// (e.g. BM25), so the binary signal is a filter rather than a ranking
// substitute.
package reranker

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// Client posts rerank requests to an Ollama-compatible endpoint.
type Client struct {
	// URL is the backend base URL; "/api/generate" is appended to it for
	// every request. New stores it with trailing slashes removed.
	URL string
	// Model is sent as the "model" field of each generate request.
	Model string
	// HTTP is the client used to send requests. New installs one with a
	// 30-second timeout.
	HTTP *http.Client
}

// New constructs a Client for the backend at url using the given model.
// It returns nil when url is empty so callers can treat a missing
// BRAIN_RERANKER_URL as "feature disabled" with a single nil check.
// Trailing slashes are trimmed from url, and the embedded HTTP client is
// given a 30-second timeout.
func New(url, model string) *Client {
	if url == "" {
		return nil
	}
	return &Client{
		URL:   strings.TrimRight(url, "/"),
		Model: model,
		HTTP:  &http.Client{Timeout: 30 * time.Second},
	}
}

// Score returns one relevance score per input document, parallel to the
// input order. Each score is binary: 1.0 when the model answers "yes",
// 0.0 otherwise. Each (query, doc) pair is scored independently, one
// request after another — Qwen3-Reranker is a cross-encoder and expects
// per-pair calls.
//
// Scoring stops at the first document that fails; Score then returns a
// nil slice and an error that names the failing document's index.
func (c *Client) Score(ctx context.Context, query string, docs []string) ([]float64, error) { out := make([]float64, len(docs)) for i, doc := range docs { s, err := c.scoreOne(ctx, query, doc) if err != nil { return nil, fmt.Errorf("rerank doc %d: %w", i, err) } out[i] = s } return out, nil } func (c *Client) scoreOne(ctx context.Context, query, doc string) (float64, error) { prompt := buildPrompt(query, doc) reqBody, _ := json.Marshal(map[string]any{ "model": c.Model, "prompt": prompt, "stream": false, "options": map[string]any{ "num_predict": 4, "temperature": 0, }, }) req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.URL+"/api/generate", bytes.NewReader(reqBody)) if err != nil { return 0, err } req.Header.Set("Content-Type", "application/json") resp, err := c.HTTP.Do(req) if err != nil { return 0, err } defer func() { _ = resp.Body.Close() }() if resp.StatusCode/100 != 2 { body, _ := io.ReadAll(resp.Body) return 0, fmt.Errorf("status %d: %s", resp.StatusCode, string(body)) } var out struct { Response string `json:"response"` } if err := json.NewDecoder(resp.Body).Decode(&out); err != nil { return 0, err } return parseYesNo(out.Response), nil } // buildPrompt assembles the Qwen3-Reranker chat template. Kept verbatim // because the model was trained on this exact wording. func buildPrompt(query, doc string) string { return "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n" + "<|im_start|>user\n: Given a web search query, retrieve relevant passages that answer the query\n" + ": " + query + "\n" + ": " + doc + "<|im_end|>\n" + "<|im_start|>assistant\n\n\n\n\n" } // parseYesNo extracts the first meaningful token from response and // returns 1.0 when it starts with "yes" (case-insensitive), 0.0 otherwise. // Any leading whitespace, `` block, or punctuation is skipped. 
func parseYesNo(s string) float64 {
	// Purpose: turn the model's raw reply into a binary vote — 1.0 when the
	// first meaningful token starts with "yes" (any letter case), 0.0 for
	// everything else, including empty input.

	// The model may still emit a reasoning block even when thinking is
	// pre-filled as empty; the answer is whatever follows the first closing
	// tag. Without a closing tag the reply is left untouched.
	const thinkClose = "</think>"
	if _, rest, found := strings.Cut(s, thinkClose); found {
		s = rest
	}

	// Skip whitespace plus stray quoting/markdown/punctuation that can
	// precede the answer (e.g. `"yes"`, `**Yes**`, `- yes`).
	const leadingNoise = " \t\r\n\"'`*_.,:;!?-()[]{}"
	s = strings.TrimLeft(strings.TrimSpace(s), leadingNoise)

	// Prefix match rather than equality: trailing text such as "yes." or
	// "Yes, it does" still counts as a positive vote.
	if len(s) >= len("yes") && strings.EqualFold(s[:len("yes")], "yes") {
		return 1.0
	}
	return 0.0
}