Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6f1cb53295 | ||
|
|
37fdd33b2d |
@@ -12,6 +12,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/brain"
|
"github.com/mathiasbq/hyperguild/ingestion/internal/brain"
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
||||||
)
|
)
|
||||||
|
|
||||||
// VectorSearcher returns the top-limit nearest paths by cosine
|
// VectorSearcher returns the top-limit nearest paths by cosine
|
||||||
@@ -186,17 +187,21 @@ func hybridMerge(ctx context.Context, brainDir string, opts QueryOptions, bm25 [
|
|||||||
byPath[r.Path] = r
|
byPath[r.Path] = r
|
||||||
}
|
}
|
||||||
for rank, h := range hits {
|
for rank, h := range hits {
|
||||||
if opts.Wing != "" && !pathInScope(h.Path, opts.Wing, opts.Hall) {
|
// Vector store keys are chunk paths ("wiki/foo.md#0001"); collapse
|
||||||
|
// back to the parent so multiple chunk hits from the same file
|
||||||
|
// score against a single result row.
|
||||||
|
parent := vectorstore.ParentPath(h.Path)
|
||||||
|
if opts.Wing != "" && !pathInScope(parent, opts.Wing, opts.Hall) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
rrf[h.Path] += 1.0 / (rrfK + float64(rank+1))
|
rrf[parent] += 1.0 / (rrfK + float64(rank+1))
|
||||||
if _, seen := byPath[h.Path]; !seen {
|
if _, seen := byPath[parent]; !seen {
|
||||||
r, err := hydrate(brainDir, h.Path)
|
r, err := hydrate(brainDir, parent)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
slog.Warn("search: hydrate failed for vector hit", "path", h.Path, "err", err)
|
slog.Warn("search: hydrate failed for vector hit", "path", parent, "err", err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
byPath[h.Path] = r
|
byPath[parent] = r
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -55,6 +55,36 @@ func TestSearch_HybridRRFPromotesVectorOnlyHit(t *testing.T) {
|
|||||||
assert.Contains(t, paths, "wiki/jepa-fx/facts/semantic.md")
|
assert.Contains(t, paths, "wiki/jepa-fx/facts/semantic.md")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSearch_HybridDedupesChunkPathsToParent(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
full := filepath.Join(dir, "knowledge", "long.md")
|
||||||
|
require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
|
||||||
|
// Body contains the BM25 keyword "alpaca" so hybridMerge actually runs
|
||||||
|
// (it only kicks in when BM25 returns at least one candidate).
|
||||||
|
require.NoError(t, os.WriteFile(full, []byte("---\ntitle: Long\n---\nalpaca content.\n"), 0o644))
|
||||||
|
|
||||||
|
embedder := stubEmbedder{vec: []float32{0.1}}
|
||||||
|
// Vector store returns three chunk-path hits all pointing at the same
|
||||||
|
// parent file. The merged result must surface ONE row per parent — not
|
||||||
|
// three rows with chunk-suffixed paths.
|
||||||
|
vector := stubVector{hits: []search.VectorHit{
|
||||||
|
{Path: "knowledge/long.md#0001", Distance: 0.05},
|
||||||
|
{Path: "knowledge/long.md#0002", Distance: 0.07},
|
||||||
|
{Path: "knowledge/long.md#0003", Distance: 0.09},
|
||||||
|
}}
|
||||||
|
|
||||||
|
got, err := search.Query(dir, search.QueryOptions{
|
||||||
|
Query: "alpaca",
|
||||||
|
Limit: 5,
|
||||||
|
Vector: vector,
|
||||||
|
Embedder: embedder,
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, got, 1, "three chunk hits for one parent must merge to one result")
|
||||||
|
assert.Equal(t, "knowledge/long.md", got[0].Path)
|
||||||
|
assert.Equal(t, "Long", got[0].Title)
|
||||||
|
}
|
||||||
|
|
||||||
func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) {
|
func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
||||||
|
|||||||
137
ingestion/internal/vectorstore/chunk.go
Normal file
137
ingestion/internal/vectorstore/chunk.go
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
package vectorstore
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NumberedChunk couples one chunk body with the storage path it is kept
// under in brain_embeddings. The path has the form "<parent>#NNNN", where
// NNNN is the chunk's 1-based index zero-padded to four digits.
type NumberedChunk struct {
	// Path is the chunk's storage key ("<parent>#NNNN").
	Path string
	// Content is the chunk body.
	Content string
}
|
||||||
|
|
||||||
|
// ParentPath returns p with a trailing "#NNNN" chunk suffix removed, so
// chunk-level keys can be collapsed back to the document they came from.
//
// Only a suffix made of the LAST "#" followed by one or more ASCII digits
// is stripped. Anything else is returned unchanged: inputs without "#",
// inputs ending in a bare "#", and inputs whose final "#" is followed by
// non-digits. Cutting at the first "#" would truncate a parent path that
// itself contains "#" (e.g. "wiki/c#/notes.md#0002"), and the truncated
// value would no longer match the file's real relative path.
func ParentPath(p string) string {
	i := strings.LastIndexByte(p, '#')
	if i < 0 || i == len(p)-1 {
		// No "#" at all, or nothing after it: not a chunk path.
		return p
	}
	for _, c := range []byte(p[i+1:]) {
		if c < '0' || c > '9' {
			// The "#" belongs to the file name, not to a chunk index.
			return p
		}
	}
	return p[:i]
}
|
||||||
|
|
||||||
|
// NumberChunks assigns "<parent>#NNNN" storage paths to a slice of chunk
|
||||||
|
// bodies, indexed from 0001. Empty chunks are dropped.
|
||||||
|
func NumberChunks(parent string, chunks []string) []NumberedChunk {
|
||||||
|
out := make([]NumberedChunk, 0, len(chunks))
|
||||||
|
idx := 1
|
||||||
|
for _, c := range chunks {
|
||||||
|
if strings.TrimSpace(c) == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, NumberedChunk{
|
||||||
|
Path: fmt.Sprintf("%s#%04d", parent, idx),
|
||||||
|
Content: c,
|
||||||
|
})
|
||||||
|
idx++
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// ChunkMarkdown splits a markdown document into embedding-sized pieces.
|
||||||
|
// Strategy:
|
||||||
|
// 1. Split at H1/H2 headings (top-of-line "#" or "##"). The intro before
|
||||||
|
// the first heading is its own chunk.
|
||||||
|
// 2. Any section larger than maxBytes is further split at paragraph
|
||||||
|
// boundaries (blank lines), packing paragraphs greedily under the
|
||||||
|
// byte budget.
|
||||||
|
//
|
||||||
|
// The function aims for "fits comfortably under nomic-embed-text's 2048-
|
||||||
|
// token context" — at ~4 chars/token for English markdown, maxBytes ≈ 4000
|
||||||
|
// is a safe call-site default.
|
||||||
|
func ChunkMarkdown(content string, maxBytes int) []string {
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = 4000
|
||||||
|
}
|
||||||
|
sections := splitAtHeadings(content)
|
||||||
|
|
||||||
|
out := make([]string, 0, len(sections))
|
||||||
|
for _, s := range sections {
|
||||||
|
if len(s) <= maxBytes {
|
||||||
|
out = append(out, s)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, splitAtParagraphs(s, maxBytes)...)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// splitAtHeadings cuts content into sections, starting a new section at
// every line that (after leading spaces) begins with "# " or "## ". Text
// before the first heading forms the leading section.
//
// Lines inside fenced code blocks (``` or ~~~ fences) are never treated as
// headings, so a "# comment" line in a shell or Python sample does not cut
// the sample in half. A fence that is never closed extends to the end of
// the content.
//
// Each returned section has its trailing newlines collapsed to exactly
// one. Sections containing only whitespace are dropped wherever they
// occur, so blank lines before the first heading do not yield a chunk.
func splitAtHeadings(content string) []string {
	var sections []string
	var cur strings.Builder

	// flush emits the accumulated section, if it has any visible content,
	// and starts a fresh one.
	flush := func() {
		s := strings.TrimRight(cur.String(), "\n")
		cur.Reset()
		if strings.TrimSpace(s) == "" {
			return
		}
		sections = append(sections, s+"\n")
	}

	// fenceRun returns the leading run of three or more backticks or tildes
	// on line, or "" when line cannot open or close a fenced code block.
	fenceRun := func(line string) string {
		if line == "" || (line[0] != '`' && line[0] != '~') {
			return ""
		}
		n := 0
		for n < len(line) && line[n] == line[0] {
			n++
		}
		if n < 3 {
			return ""
		}
		// A backtick fence's info string may not contain backticks; such a
		// line is inline code, not a fence.
		if line[0] == '`' && strings.Contains(line[n:], "`") {
			return ""
		}
		return line[:n]
	}

	fence := "" // opening fence run while inside a fenced code block
	for _, ln := range strings.Split(content, "\n") {
		trimmed := strings.TrimLeft(ln, " ")
		run := fenceRun(trimmed)
		switch {
		case fence != "":
			// Inside a fence: only a matching closing fence matters. It must
			// use the same character, be at least as long as the opener, and
			// carry nothing but whitespace after it.
			if run != "" && run[0] == fence[0] && len(run) >= len(fence) &&
				strings.TrimSpace(trimmed[len(run):]) == "" {
				fence = ""
			}
		case run != "":
			fence = run
		case strings.HasPrefix(trimmed, "# ") || strings.HasPrefix(trimmed, "## "):
			flush()
		}
		cur.WriteString(ln)
		cur.WriteByte('\n')
	}
	flush()
	return sections
}
|
||||||
|
|
||||||
|
// splitAtParagraphs greedily packs the blank-line-separated paragraphs of
// section into sub-chunks of at most maxBytes bytes, rejoining paragraphs
// inside a sub-chunk with "\n\n". Empty paragraphs are skipped. A single
// paragraph longer than maxBytes is emitted whole as an over-budget chunk
// rather than being cut mid-sentence.
func splitAtParagraphs(section string, maxBytes int) []string {
	const sep = "\n\n"

	var (
		chunks []string
		buf    strings.Builder
	)
	// emit closes the sub-chunk being built, if there is one.
	emit := func() {
		if buf.Len() == 0 {
			return
		}
		chunks = append(chunks, buf.String())
		buf.Reset()
	}

	for _, para := range strings.Split(section, sep) {
		if len(para) == 0 {
			continue
		}
		// Adding to a non-empty buffer costs the separator plus the
		// paragraph; close the buffer first if that would bust the budget.
		if buf.Len() > 0 && buf.Len()+len(sep)+len(para) > maxBytes {
			emit()
		}
		if buf.Len() > 0 {
			buf.WriteString(sep)
		}
		buf.WriteString(para)
	}
	emit()
	return chunks
}
|
||||||
72
ingestion/internal/vectorstore/chunk_test.go
Normal file
72
ingestion/internal/vectorstore/chunk_test.go
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
package vectorstore_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestChunkMarkdown_ShortFileFitsInOne(t *testing.T) {
|
||||||
|
out := vectorstore.ChunkMarkdown("Just a short paragraph.\n", 4000)
|
||||||
|
require.Len(t, out, 1)
|
||||||
|
assert.Equal(t, "Just a short paragraph.\n", out[0])
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChunkMarkdown_SplitsAtHeadings(t *testing.T) {
|
||||||
|
src := "# Top\n\nintro\n\n## A\n\nbody a\n\n## B\n\nbody b\n"
|
||||||
|
out := vectorstore.ChunkMarkdown(src, 50) // tiny limit forces per-section split
|
||||||
|
|
||||||
|
assert.GreaterOrEqual(t, len(out), 2, "should split at H2 boundaries")
|
||||||
|
// Each chunk should start with a heading (top-level intro chunk OK without one)
|
||||||
|
for i, c := range out {
|
||||||
|
if i == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
assert.True(t, strings.HasPrefix(strings.TrimSpace(c), "#"),
|
||||||
|
"non-first chunk %d should start with heading: %q", i, c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChunkMarkdown_FurtherSplitsOversizedSection(t *testing.T) {
|
||||||
|
// One H2 section with 4 paragraphs of ~80 chars each, limit 100.
|
||||||
|
src := "## big\n\n" +
|
||||||
|
strings.Repeat("paragraph one is moderately long.\n\n", 1) +
|
||||||
|
strings.Repeat("paragraph two also moderately long.\n\n", 1) +
|
||||||
|
strings.Repeat("paragraph three is moderately long.\n\n", 1) +
|
||||||
|
strings.Repeat("paragraph four is moderately long.\n\n", 1)
|
||||||
|
out := vectorstore.ChunkMarkdown(src, 100)
|
||||||
|
|
||||||
|
assert.Greater(t, len(out), 1, "oversized section should sub-split at paragraph boundaries")
|
||||||
|
for i, c := range out {
|
||||||
|
assert.LessOrEqual(t, len(c), 200,
|
||||||
|
"chunk %d exceeds 2x maxBytes: %d", i, len(c))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChunkMarkdown_PreservesContent(t *testing.T) {
|
||||||
|
src := "# H1\n\nfirst section body.\n\n## H2a\n\nsecond section body.\n\n## H2b\n\nthird section body.\n"
|
||||||
|
out := vectorstore.ChunkMarkdown(src, 50)
|
||||||
|
joined := strings.Join(out, "")
|
||||||
|
// All non-whitespace tokens from src must appear in the joined output
|
||||||
|
for _, token := range []string{"H1", "first", "H2a", "second", "H2b", "third"} {
|
||||||
|
assert.Contains(t, joined, token, "token %q missing after chunking", token)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChunkMarkdown_NumberedSuffix(t *testing.T) {
|
||||||
|
out := vectorstore.NumberChunks("knowledge/foo.md", []string{"a", "b", "c"})
|
||||||
|
require.Len(t, out, 3)
|
||||||
|
assert.Equal(t, "knowledge/foo.md#0001", out[0].Path)
|
||||||
|
assert.Equal(t, "knowledge/foo.md#0002", out[1].Path)
|
||||||
|
assert.Equal(t, "knowledge/foo.md#0003", out[2].Path)
|
||||||
|
assert.Equal(t, "a", out[0].Content)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParentPath_StripsChunkSuffix(t *testing.T) {
|
||||||
|
assert.Equal(t, "knowledge/foo.md", vectorstore.ParentPath("knowledge/foo.md#0001"))
|
||||||
|
assert.Equal(t, "knowledge/foo.md", vectorstore.ParentPath("knowledge/foo.md"))
|
||||||
|
assert.Equal(t, "wiki/a/b.md", vectorstore.ParentPath("wiki/a/b.md#9999"))
|
||||||
|
}
|
||||||
@@ -37,6 +37,13 @@ type SyncResult struct {
|
|||||||
// source pages; knowledge/ holds curated hand-written entries.
|
// source pages; knowledge/ holds curated hand-written entries.
|
||||||
var scanDirs = []string{"wiki", "knowledge"}
|
var scanDirs = []string{"wiki", "knowledge"}
|
||||||
|
|
||||||
|
// maxChunkBytes is the per-chunk byte budget passed to ChunkMarkdown.
|
||||||
|
// Sized to fit comfortably under nomic-embed-text's 2048-token default
|
||||||
|
// context (~4 chars/token for English markdown → ~8 KB ceiling; we sit
|
||||||
|
// at 4 KB to leave headroom for unicode, code blocks, and tokenizer
|
||||||
|
// variance).
|
||||||
|
const maxChunkBytes = 4000
|
||||||
|
|
||||||
// Sync brings the embedding store in line with brain/{wiki,knowledge}/
|
// Sync brings the embedding store in line with brain/{wiki,knowledge}/
|
||||||
// on disk:
|
// on disk:
|
||||||
// - new files (in the tree, not in the store) get embedded + upserted
|
// - new files (in the tree, not in the store) get embedded + upserted
|
||||||
@@ -55,7 +62,13 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return res, fmt.Errorf("known paths: %w", err)
|
return res, fmt.Errorf("known paths: %w", err)
|
||||||
}
|
}
|
||||||
seen := make(map[string]struct{})
|
// Build a parent → "any chunk known?" set so we can skip files that
|
||||||
|
// already have at least one chunk row in the store.
|
||||||
|
knownParents := make(map[string]struct{}, len(known))
|
||||||
|
for p := range known {
|
||||||
|
knownParents[ParentPath(p)] = struct{}{}
|
||||||
|
}
|
||||||
|
seenParents := make(map[string]struct{})
|
||||||
|
|
||||||
for _, sub := range scanDirs {
|
for _, sub := range scanDirs {
|
||||||
root := filepath.Join(brainDir, sub)
|
root := filepath.Join(brainDir, sub)
|
||||||
@@ -75,11 +88,12 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
relSlash := filepath.ToSlash(rel)
|
relSlash := filepath.ToSlash(rel)
|
||||||
seen[relSlash] = struct{}{}
|
seenParents[relSlash] = struct{}{}
|
||||||
|
|
||||||
if _, ok := known[relSlash]; ok {
|
if _, ok := knownParents[relSlash]; ok {
|
||||||
// Already embedded — TODO: compare mtime once Store exposes
|
// File has at least one chunk in the store already.
|
||||||
// updated_at so we re-embed on edit. For now, skip.
|
// TODO: compare mtime once Store exposes updated_at so we
|
||||||
|
// re-embed on edit. For now, skip.
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -88,16 +102,19 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
|
|||||||
res.Errors = append(res.Errors, fmt.Errorf("read %s: %w", relSlash, readErr))
|
res.Errors = append(res.Errors, fmt.Errorf("read %s: %w", relSlash, readErr))
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
vec, embErr := embedder.Embed(ctx, string(content))
|
chunks := NumberChunks(relSlash, ChunkMarkdown(string(content), maxChunkBytes))
|
||||||
|
for _, ch := range chunks {
|
||||||
|
vec, embErr := embedder.Embed(ctx, ch.Content)
|
||||||
if embErr != nil {
|
if embErr != nil {
|
||||||
res.Errors = append(res.Errors, fmt.Errorf("embed %s: %w", relSlash, embErr))
|
res.Errors = append(res.Errors, fmt.Errorf("embed %s: %w", ch.Path, embErr))
|
||||||
return nil
|
continue
|
||||||
}
|
}
|
||||||
if upErr := store.Upsert(ctx, relSlash, vec); upErr != nil {
|
if upErr := store.Upsert(ctx, ch.Path, vec); upErr != nil {
|
||||||
res.Errors = append(res.Errors, fmt.Errorf("upsert %s: %w", relSlash, upErr))
|
res.Errors = append(res.Errors, fmt.Errorf("upsert %s: %w", ch.Path, upErr))
|
||||||
return nil
|
continue
|
||||||
}
|
}
|
||||||
res.Added++
|
res.Added++
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -105,9 +122,9 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Drop rows whose file is gone.
|
// Drop chunk rows whose parent file is gone.
|
||||||
for path := range known {
|
for path := range known {
|
||||||
if _, ok := seen[path]; ok {
|
if _, ok := seenParents[ParentPath(path)]; ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if err := store.Delete(ctx, path); err != nil {
|
if err := store.Delete(ctx, path); err != nil {
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
||||||
@@ -72,15 +73,15 @@ func TestSync_AddsNewFiles(t *testing.T) {
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
assert.Equal(t, 2, res.Added)
|
assert.Equal(t, 2, res.Added)
|
||||||
assert.Empty(t, res.Deleted)
|
assert.Empty(t, res.Deleted)
|
||||||
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md")
|
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md#0001")
|
||||||
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md")
|
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md#0001")
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestSync_SkipsAlreadyKnown(t *testing.T) {
|
func TestSync_SkipsAlreadyKnown(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
||||||
|
|
||||||
store := &stubStore{known: map[string]struct{}{"wiki/a/facts/x.md": {}}}
|
store := &stubStore{known: map[string]struct{}{"wiki/a/facts/x.md#0001": {}}}
|
||||||
emb := stubEmbedder{vec: make([]float32, 768)}
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
||||||
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
@@ -92,7 +93,7 @@ func TestSync_DeletesDisappearedFiles(t *testing.T) {
|
|||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
||||||
// store has a path that doesn't exist on disk anymore
|
// store has a path that doesn't exist on disk anymore
|
||||||
store := &stubStore{known: map[string]struct{}{"wiki/old/facts/ghost.md": {}}}
|
store := &stubStore{known: map[string]struct{}{"wiki/old/facts/ghost.md#0001": {}}}
|
||||||
res, err := vectorstore.Sync(context.Background(), dir, &stubStoreWithDelete{stubStore: store}, stubEmbedder{vec: make([]float32, 768)})
|
res, err := vectorstore.Sync(context.Background(), dir, &stubStoreWithDelete{stubStore: store}, stubEmbedder{vec: make([]float32, 768)})
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
assert.Equal(t, 1, res.Deleted)
|
assert.Equal(t, 1, res.Deleted)
|
||||||
@@ -114,7 +115,7 @@ func TestSync_SkipsIndexFiles(t *testing.T) {
|
|||||||
res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
|
res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
assert.Equal(t, 1, res.Added)
|
assert.Equal(t, 1, res.Added)
|
||||||
assert.NotContains(t, store.upserts, "wiki/a/_index.md")
|
assert.NotContains(t, store.upserts, "wiki/a/_index.md#0001")
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestSync_ScansKnowledgeDir(t *testing.T) {
|
func TestSync_ScansKnowledgeDir(t *testing.T) {
|
||||||
@@ -127,8 +128,75 @@ func TestSync_ScansKnowledgeDir(t *testing.T) {
|
|||||||
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
assert.Equal(t, 2, res.Added)
|
assert.Equal(t, 2, res.Added)
|
||||||
assert.Contains(t, store.upserts, "wiki/a/facts/x.md")
|
assert.Contains(t, store.upserts, "wiki/a/facts/x.md#0001")
|
||||||
assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md")
|
assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md#0001")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSync_ChunksLongFiles(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
// Build a file that's well over the chunk byte budget. Multi-section
|
||||||
|
// markdown so the chunker has heading boundaries to cut on.
|
||||||
|
body := "# Doc\n\nintro line.\n\n"
|
||||||
|
for i := 0; i < 10; i++ {
|
||||||
|
body += "## Section " + string(rune('A'+i)) + "\n\n"
|
||||||
|
body += strings.Repeat("This section has a fair amount of content. ", 50) + "\n\n"
|
||||||
|
}
|
||||||
|
writeNote(t, dir, "knowledge/long.md", body)
|
||||||
|
|
||||||
|
store := &stubStore{known: map[string]struct{}{}}
|
||||||
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
||||||
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Greater(t, res.Added, 1, "long file should produce multiple chunk rows")
|
||||||
|
// Every upserted path for this file must be a chunk path.
|
||||||
|
chunkCount := 0
|
||||||
|
for p := range store.upserts {
|
||||||
|
if strings.HasPrefix(p, "knowledge/long.md#") {
|
||||||
|
chunkCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert.Equal(t, res.Added, chunkCount, "all rows for long file should be chunk-suffixed")
|
||||||
|
// The bare parent path must NOT be upserted directly.
|
||||||
|
assert.NotContains(t, store.upserts, "knowledge/long.md")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSync_ShortFileGetsSingleChunkRow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
writeNote(t, dir, "wiki/short.md", "tiny body\n")
|
||||||
|
|
||||||
|
store := &stubStore{known: map[string]struct{}{}}
|
||||||
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
||||||
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, 1, res.Added)
|
||||||
|
assert.Contains(t, store.upserts, "wiki/short.md#0001")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSync_SkipsFileIfAnyChunkAlreadyKnown(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
writeNote(t, dir, "wiki/foo.md", "body\n")
|
||||||
|
|
||||||
|
store := &stubStore{known: map[string]struct{}{
|
||||||
|
"wiki/foo.md#0001": {},
|
||||||
|
}}
|
||||||
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
||||||
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, 0, res.Added)
|
||||||
|
assert.Empty(t, store.upserts)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSync_DeletesAllChunksOfDisappearedFile(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
||||||
|
store := &stubStore{known: map[string]struct{}{
|
||||||
|
"wiki/ghost.md#0001": {},
|
||||||
|
"wiki/ghost.md#0002": {},
|
||||||
|
"wiki/ghost.md#0003": {},
|
||||||
|
}}
|
||||||
|
res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, 3, res.Deleted)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestSync_NoOpWhenComponentsNil(t *testing.T) {
|
func TestSync_NoOpWhenComponentsNil(t *testing.T) {
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ type createArgs struct {
|
|||||||
Folder string `json:"folder"`
|
Folder string `json:"folder"`
|
||||||
Stack string `json:"stack"`
|
Stack string `json:"stack"`
|
||||||
Private bool `json:"private"`
|
Private bool `json:"private"`
|
||||||
|
MirrorToGitHub bool `json:"mirror_to_github,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type createResult struct {
|
type createResult struct {
|
||||||
@@ -59,11 +60,12 @@ func (s *Skill) handleCreate(ctx context.Context, raw json.RawMessage) (json.Raw
|
|||||||
|
|
||||||
tmpl := templateFor(args.Stack)
|
tmpl := templateFor(args.Stack)
|
||||||
giteaURL := fmt.Sprintf("http://gitea.d-ma.be/%s/%s", s.cfg.GiteaOwner, args.Name)
|
giteaURL := fmt.Sprintf("http://gitea.d-ma.be/%s/%s", s.cfg.GiteaOwner, args.Name)
|
||||||
githubURL := fmt.Sprintf("https://github.com/%s/%s", s.cfg.GitHubOwner, args.Name)
|
|
||||||
|
|
||||||
res := createResult{
|
res := createResult{
|
||||||
GiteaURL: giteaURL,
|
GiteaURL: giteaURL,
|
||||||
GitHubURL: githubURL,
|
}
|
||||||
|
if args.MirrorToGitHub {
|
||||||
|
res.GitHubURL = fmt.Sprintf("https://github.com/%s/%s", s.cfg.GitHubOwner, args.Name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step 1: create_project_from_template. If the repo already exists,
|
// Step 1: create_project_from_template. If the repo already exists,
|
||||||
@@ -75,6 +77,12 @@ func (s *Skill) handleCreate(ctx context.Context, raw json.RawMessage) (json.Raw
|
|||||||
}
|
}
|
||||||
res.Reached = append(res.Reached, stepCreateRepo)
|
res.Reached = append(res.Reached, stepCreateRepo)
|
||||||
|
|
||||||
|
// Steps 2+3 are skipped when MirrorToGitHub is false. Default per
|
||||||
|
// infra ADR (Gitea as true master, GitHub as optional opt-in): keep
|
||||||
|
// client / business-logic / personal repos Gitea-only. Set
|
||||||
|
// `mirror_to_github: true` for open-source projects that want a
|
||||||
|
// public GitHub mirror (hyperguild, gitea-mcp, template-*).
|
||||||
|
if args.MirrorToGitHub {
|
||||||
// Step 2: create empty GitHub repo. Gitea's push-mirror cannot push
|
// Step 2: create empty GitHub repo. Gitea's push-mirror cannot push
|
||||||
// to a non-existent remote, so the destination must exist before
|
// to a non-existent remote, so the destination must exist before
|
||||||
// step 3 configures the mirror. Skipped when GitHub client is unset
|
// step 3 configures the mirror. Skipped when GitHub client is unset
|
||||||
@@ -94,6 +102,7 @@ func (s *Skill) handleCreate(ctx context.Context, raw json.RawMessage) (json.Raw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
res.Reached = append(res.Reached, stepMirror)
|
res.Reached = append(res.Reached, stepMirror)
|
||||||
|
}
|
||||||
|
|
||||||
// Step 3: commit staging namespace manifest to infra repo. Done before
|
// Step 3: commit staging namespace manifest to infra repo. Done before
|
||||||
// the issue so the staging env is reconciling by the time the issue lands.
|
// the issue so the staging env is reconciling by the time the issue lands.
|
||||||
@@ -228,7 +237,11 @@ func experimentBrief(args createArgs, existed bool) string {
|
|||||||
b.WriteString("- Repo created from `template-")
|
b.WriteString("- Repo created from `template-")
|
||||||
b.WriteString(args.Stack)
|
b.WriteString(args.Stack)
|
||||||
b.WriteString("` on Gitea.\n")
|
b.WriteString("` on Gitea.\n")
|
||||||
|
if args.MirrorToGitHub {
|
||||||
b.WriteString("- Push-mirror configured to GitHub.\n")
|
b.WriteString("- Push-mirror configured to GitHub.\n")
|
||||||
|
} else {
|
||||||
|
b.WriteString("- Gitea-only (no GitHub mirror — set `mirror_to_github: true` to opt in).\n")
|
||||||
|
}
|
||||||
b.WriteString("- Staging namespace manifest committed to infra repo.\n\n")
|
b.WriteString("- Staging namespace manifest committed to infra repo.\n\n")
|
||||||
if existed {
|
if existed {
|
||||||
b.WriteString("> Note: this repo already existed when `project_create` ran — provisioning steps were re-applied idempotently.\n")
|
b.WriteString("> Note: this repo already existed when `project_create` ran — provisioning steps were re-applied idempotently.\n")
|
||||||
|
|||||||
@@ -158,6 +158,9 @@ func mustClient(t *testing.T, url string) *mcpclient.Client {
|
|||||||
return c
|
return c
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// happyArgs returns the minimal valid request. With the Gitea-as-true-master
|
||||||
|
// ADR shipped, this defaults to Gitea-only (mirror_to_github omitted = false).
|
||||||
|
// Tests that need the full Gitea + GitHub mirror flow use mirroredArgs().
|
||||||
func happyArgs() json.RawMessage {
|
func happyArgs() json.RawMessage {
|
||||||
return json.RawMessage(`{
|
return json.RawMessage(`{
|
||||||
"name":"my-experiment",
|
"name":"my-experiment",
|
||||||
@@ -169,6 +172,20 @@ func happyArgs() json.RawMessage {
|
|||||||
}`)
|
}`)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mirroredArgs is happyArgs + mirror_to_github=true — the explicit opt-in
|
||||||
|
// path. Equivalent to the pre-ADR default.
|
||||||
|
func mirroredArgs() json.RawMessage {
|
||||||
|
return json.RawMessage(`{
|
||||||
|
"name":"my-experiment",
|
||||||
|
"description":"One-line desc",
|
||||||
|
"hypothesis":"We believe X produces Y",
|
||||||
|
"folder":"AGENTS",
|
||||||
|
"stack":"go-agent",
|
||||||
|
"private":true,
|
||||||
|
"mirror_to_github":true
|
||||||
|
}`)
|
||||||
|
}
|
||||||
|
|
||||||
func TestProjectCreate_HappyPath(t *testing.T) {
|
func TestProjectCreate_HappyPath(t *testing.T) {
|
||||||
f := &fakeGiteaMCP{
|
f := &fakeGiteaMCP{
|
||||||
Responses: map[string]any{
|
Responses: map[string]any{
|
||||||
@@ -177,7 +194,7 @@ func TestProjectCreate_HappyPath(t *testing.T) {
|
|||||||
}
|
}
|
||||||
skill, gh := newSkill(t, f)
|
skill, gh := newSkill(t, f)
|
||||||
|
|
||||||
out, err := skill.Handle(context.Background(), "project_create", happyArgs())
|
out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
var res map[string]any
|
var res map[string]any
|
||||||
@@ -228,7 +245,7 @@ func TestProjectCreate_GitHubExists_Idempotent(t *testing.T) {
|
|||||||
skill, gh := newSkill(t, f)
|
skill, gh := newSkill(t, f)
|
||||||
gh.ReturnError = 422 // already exists
|
gh.ReturnError = 422 // already exists
|
||||||
|
|
||||||
_, err := skill.Handle(context.Background(), "project_create", happyArgs())
|
_, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
|
||||||
require.NoError(t, err, "422 already-exists should be idempotent")
|
require.NoError(t, err, "422 already-exists should be idempotent")
|
||||||
require.Len(t, f.Calls, 4, "all gitea steps still run despite github 422")
|
require.Len(t, f.Calls, 4, "all gitea steps still run despite github 422")
|
||||||
}
|
}
|
||||||
@@ -238,7 +255,7 @@ func TestProjectCreate_GitHubFails(t *testing.T) {
|
|||||||
skill, gh := newSkill(t, f)
|
skill, gh := newSkill(t, f)
|
||||||
gh.ReturnError = 401 // bad PAT
|
gh.ReturnError = 401 // bad PAT
|
||||||
|
|
||||||
out, err := skill.Handle(context.Background(), "project_create", happyArgs())
|
out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
|
||||||
require.Error(t, err)
|
require.Error(t, err)
|
||||||
var res map[string]any
|
var res map[string]any
|
||||||
require.NoError(t, json.Unmarshal(out, &res))
|
require.NoError(t, json.Unmarshal(out, &res))
|
||||||
@@ -255,7 +272,11 @@ func TestProjectCreate_NoGitHubClient_DegradedMode(t *testing.T) {
|
|||||||
}
|
}
|
||||||
skill := newSkillNoGitHub(t, f)
|
skill := newSkillNoGitHub(t, f)
|
||||||
|
|
||||||
out, err := skill.Handle(context.Background(), "project_create", happyArgs())
|
// Use mirroredArgs so we exercise the GitHub-mirror path. With the
|
||||||
|
// GitHub client nil, the create_github_repo step is skipped but the
|
||||||
|
// mirror step still attempts to configure the push-mirror remote
|
||||||
|
// (degraded mode preserves the prior contract for opted-in projects).
|
||||||
|
out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
var res map[string]any
|
var res map[string]any
|
||||||
require.NoError(t, json.Unmarshal(out, &res))
|
require.NoError(t, json.Unmarshal(out, &res))
|
||||||
@@ -275,7 +296,7 @@ func TestProjectCreate_Idempotent_RepoExists(t *testing.T) {
|
|||||||
}
|
}
|
||||||
skill, _ := newSkill(t, f)
|
skill, _ := newSkill(t, f)
|
||||||
|
|
||||||
out, err := skill.Handle(context.Background(), "project_create", happyArgs())
|
out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
var res map[string]any
|
var res map[string]any
|
||||||
@@ -295,7 +316,7 @@ func TestProjectCreate_MirrorFails(t *testing.T) {
|
|||||||
}
|
}
|
||||||
skill, _ := newSkill(t, f)
|
skill, _ := newSkill(t, f)
|
||||||
|
|
||||||
out, err := skill.Handle(context.Background(), "project_create", happyArgs())
|
out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
|
||||||
require.Error(t, err)
|
require.Error(t, err)
|
||||||
assert.Contains(t, err.Error(), `"mirror" failed`)
|
assert.Contains(t, err.Error(), `"mirror" failed`)
|
||||||
|
|
||||||
@@ -317,7 +338,7 @@ func TestProjectCreate_InfraCommitFails(t *testing.T) {
|
|||||||
}
|
}
|
||||||
skill, _ := newSkill(t, f)
|
skill, _ := newSkill(t, f)
|
||||||
|
|
||||||
out, err := skill.Handle(context.Background(), "project_create", happyArgs())
|
out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
|
||||||
require.Error(t, err)
|
require.Error(t, err)
|
||||||
|
|
||||||
var res map[string]any
|
var res map[string]any
|
||||||
@@ -351,6 +372,45 @@ func TestProjectCreate_ValidationErrors(t *testing.T) {
|
|||||||
assert.Empty(t, f.Calls, "no upstream calls should occur on validation failure")
|
assert.Empty(t, f.Calls, "no upstream calls should occur on validation failure")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestProjectCreate_DefaultSkipsGitHubMirror(t *testing.T) {
|
||||||
|
// Default (mirror_to_github omitted) skips create_github_repo + mirror
|
||||||
|
// per the Gitea-as-true-master ADR. Gitea repo + staging namespace
|
||||||
|
// + issue still run; github_url is empty in the response.
|
||||||
|
f := &fakeGiteaMCP{
|
||||||
|
Responses: map[string]any{
|
||||||
|
"issue_create": map[string]any{"html_url": "http://gitea.d-ma.be/mathias/my-experiment/issues/1"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
skill, gh := newSkill(t, f)
|
||||||
|
|
||||||
|
out, err := skill.Handle(context.Background(), "project_create", happyArgs())
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
var res map[string]any
|
||||||
|
require.NoError(t, json.Unmarshal(out, &res))
|
||||||
|
|
||||||
|
assert.Equal(t, "http://gitea.d-ma.be/mathias/my-experiment", res["gitea_url"])
|
||||||
|
assert.Equal(t, "", res["github_url"], "github_url must be empty when mirror not opted in")
|
||||||
|
assert.Equal(t, "http://gitea.d-ma.be/mathias/my-experiment/issues/1", res["issue_url"])
|
||||||
|
|
||||||
|
// 3 gitea-mcp calls: template create, staging file write, issue. NO mirror call.
|
||||||
|
require.Len(t, f.Calls, 3)
|
||||||
|
assert.Equal(t, "create_project_from_template", f.Calls[0].Tool)
|
||||||
|
assert.Equal(t, "file_write_branch", f.Calls[1].Tool)
|
||||||
|
assert.Equal(t, "issue_create", f.Calls[2].Tool)
|
||||||
|
|
||||||
|
// Zero GitHub API calls.
|
||||||
|
assert.Empty(t, gh.Calls, "no GitHub repo created when mirror_to_github is false")
|
||||||
|
|
||||||
|
// reached lists the Gitea-only path.
|
||||||
|
reached := res["reached"].([]any)
|
||||||
|
assert.Equal(t, []any{"create_repo", "infra_commit", "issue"}, reached)
|
||||||
|
|
||||||
|
// experiment-brief body reflects Gitea-only provisioning.
|
||||||
|
require.Contains(t, f.Calls[2].Args["body"], "Gitea-only")
|
||||||
|
require.NotContains(t, f.Calls[2].Args["body"], "Push-mirror configured")
|
||||||
|
}
|
||||||
|
|
||||||
func TestProjectCreate_UnknownTool(t *testing.T) {
|
func TestProjectCreate_UnknownTool(t *testing.T) {
|
||||||
f := &fakeGiteaMCP{}
|
f := &fakeGiteaMCP{}
|
||||||
skill, _ := newSkill(t, f)
|
skill, _ := newSkill(t, f)
|
||||||
|
|||||||
@@ -79,13 +79,22 @@ func (s *Skill) Tools() []registry.ToolDef {
|
|||||||
"description": "Selects template-go-agent or template-go-web.",
|
"description": "Selects template-go-agent or template-go-web.",
|
||||||
},
|
},
|
||||||
"private": map[string]any{"type": "boolean"},
|
"private": map[string]any{"type": "boolean"},
|
||||||
|
"mirror_to_github": map[string]any{
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Default false. When true, also create an empty GitHub repo " +
|
||||||
|
"and configure a push-mirror from Gitea. Opt-in per the Gitea-as-true-master " +
|
||||||
|
"ADR — only set true for open-source projects (hyperguild, gitea-mcp, template-*). " +
|
||||||
|
"Never set true for client projects, business logic, or personal experiments.",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"required": []string{"name", "description", "hypothesis", "stack"},
|
"required": []string{"name", "description", "hypothesis", "stack"},
|
||||||
})
|
})
|
||||||
return []registry.ToolDef{
|
return []registry.ToolDef{
|
||||||
{
|
{
|
||||||
Name: "project_create",
|
Name: "project_create",
|
||||||
Description: "Bootstrap a new project: Gitea repo from template, GitHub push-mirror, staging namespace manifest, experiment-brief issue. Idempotent — re-running with an existing repo returns the existing URLs.",
|
Description: "Bootstrap a new project: Gitea repo from template, staging namespace manifest, " +
|
||||||
|
"experiment-brief issue. Optionally mirrors to GitHub when `mirror_to_github: true` " +
|
||||||
|
"(default false). Idempotent — re-running with an existing repo returns the existing URLs.",
|
||||||
InputSchema: schema,
|
InputSchema: schema,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user