feat(project_create): mirror_to_github opt-in, default false (infra#34 ADR)

Per the Gitea-as-true-master ADR (infra#34), the GitHub mirror is now an explicit opt-in via mirror_to_github=true. Default (omit / false) provisions a Gitea repo + staging namespace + experiment-brief issue only — no GitHub repo, no push-mirror. Rationale: US cloud providers (Microsoft/GitHub) are subject to the CLOUD Act and National Security Letters (NSLs). Client code, business logic, and infra-adjacent repos should never live on US-owned infrastructure. Only open-source projects intended for the public community (hyperguild, gitea-mcp, template-*) should opt in. Changes - internal/skills/project/handlers.go - createArgs gains MirrorToGitHub bool (json:"mirror_to_github,omitempty"). - res.GitHubURL is set only when MirrorToGitHub is true; empty string otherwise. - Steps 2 (create_github_repo) + 3 (mirror) are wrapped in `if args.MirrorToGitHub`. - experimentBrief renders "Gitea-only" line by default and the existing "Push-mirror configured" line only on opt-in. - internal/skills/project/skill.go - Tool schema gains mirror_to_github (boolean, default false) with description spelling out when to opt in. Tool Description updated to reflect new default. - internal/skills/project/handlers_test.go - Added mirroredArgs() helper (happyArgs + mirror_to_github:true). - Tests that exercise the GitHub flow (HappyPath, GitHubExists_Idempotent, GitHubFails, NoGitHubClient_DegradedMode, Idempotent_RepoExists, MirrorFails, InfraCommitFails) switched to mirroredArgs. - Added TestProjectCreate_DefaultSkipsGitHubMirror covering the Gitea-only path: 3 gitea-mcp calls, zero GitHub calls, empty github_url, reached=[create_repo, infra_commit, issue], body reflects Gitea-only. Closes gitea/mathias/hyperguild#17. Moves infra#34 acceptance item "project_create updated: mirror_to_github defaults to false".
feat(ingestion): chunk markdown before embedding (#38)
2026-05-20 08:35:02 +02:00 · 2026-05-19 21:57:09 +02:00
9 changed files with 474 additions and 63 deletions
--- a/ingestion/internal/search/search.go
+++ b/ingestion/internal/search/search.go
@@ -12,6 +12,7 @@ import (
 	"strings"

 	"github.com/mathiasbq/hyperguild/ingestion/internal/brain"
+	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
 )

 // VectorSearcher returns the top-limit nearest paths by cosine
@@ -186,17 +187,21 @@ func hybridMerge(ctx context.Context, brainDir string, opts QueryOptions, bm25 [
 		byPath[r.Path] = r
 	}
 	for rank, h := range hits {
-		if opts.Wing != "" && !pathInScope(h.Path, opts.Wing, opts.Hall) {
+		// Vector store keys are chunk paths ("wiki/foo.md#0001"); collapse
+		// back to the parent so multiple chunk hits from the same file
+		// score against a single result row.
+		parent := vectorstore.ParentPath(h.Path)
+		if opts.Wing != "" && !pathInScope(parent, opts.Wing, opts.Hall) {
 			continue
 		}
-		rrf[h.Path] += 1.0 / (rrfK + float64(rank+1))
-		if _, seen := byPath[h.Path]; !seen {
-			r, err := hydrate(brainDir, h.Path)
+		rrf[parent] += 1.0 / (rrfK + float64(rank+1))
+		if _, seen := byPath[parent]; !seen {
+			r, err := hydrate(brainDir, parent)
 			if err != nil {
-				slog.Warn("search: hydrate failed for vector hit", "path", h.Path, "err", err)
+				slog.Warn("search: hydrate failed for vector hit", "path", parent, "err", err)
 				continue
 			}
-			byPath[h.Path] = r
+			byPath[parent] = r
 		}
 	}

--- a/ingestion/internal/search/search_test.go
+++ b/ingestion/internal/search/search_test.go
@@ -55,6 +55,36 @@ func TestSearch_HybridRRFPromotesVectorOnlyHit(t *testing.T) {
 	assert.Contains(t, paths, "wiki/jepa-fx/facts/semantic.md")
 }

+// TestSearch_HybridDedupesChunkPathsToParent checks that several vector
+// hits keyed by chunk paths ("<file>#NNNN") of the same file are merged
+// into one search result that carries the parent file path and the title
+// read from that file.
+func TestSearch_HybridDedupesChunkPathsToParent(t *testing.T) {
+	dir := t.TempDir()
+	full := filepath.Join(dir, "knowledge", "long.md")
+	require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
+	// Body contains the BM25 keyword "alpaca" so hybridMerge actually runs
+	// (it only kicks in when BM25 returns at least one candidate).
+	require.NoError(t, os.WriteFile(full, []byte("---\ntitle: Long\n---\nalpaca content.\n"), 0o644))
+
+	embedder := stubEmbedder{vec: []float32{0.1}}
+	// Vector store returns three chunk-path hits all pointing at the same
+	// parent file. The merged result must surface ONE row per parent — not
+	// three rows with chunk-suffixed paths.
+	vector := stubVector{hits: []search.VectorHit{
+		{Path: "knowledge/long.md#0001", Distance: 0.05},
+		{Path: "knowledge/long.md#0002", Distance: 0.07},
+		{Path: "knowledge/long.md#0003", Distance: 0.09},
+	}}
+
+	got, err := search.Query(dir, search.QueryOptions{
+		Query:    "alpaca",
+		Limit:    5,
+		Vector:   vector,
+		Embedder: embedder,
+	})
+	require.NoError(t, err)
+	require.Len(t, got, 1, "three chunk hits for one parent must merge to one result")
+	assert.Equal(t, "knowledge/long.md", got[0].Path)
+	assert.Equal(t, "Long", got[0].Title)
+}
+
 func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) {
 	dir := t.TempDir()
 	require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
--- a/ingestion/internal/vectorstore/chunk.go
+++ b/ingestion/internal/vectorstore/chunk.go
@@ -0,0 +1,137 @@
+package vectorstore
+
+import (
+	"fmt"
+	"strings"
+)
+
+// NumberedChunk pairs a chunk's body with the storage path it will use
+// in brain_embeddings. Path format: "<parent>#NNNN" where NNNN is the
+// 1-based chunk index zero-padded to 4 digits.
+type NumberedChunk struct {
+	Path    string // storage key in the form "<parent>#NNNN"
+	Content string // chunk body text that gets embedded
+}
+
+// ParentPath returns the file path with a trailing "#NNNN" chunk suffix
+// removed. Only a final "#" followed exclusively by one or more ASCII
+// digits counts as a chunk suffix, so a "#" that belongs to the file name
+// itself is preserved ("wiki/c#-notes.md#0001" -> "wiki/c#-notes.md").
+// Inputs without such a suffix are returned unchanged. Used by search to
+// dedupe chunk-level hits back to a single document per result.
+func ParentPath(p string) string {
+	i := strings.LastIndexByte(p, '#')
+	if i < 0 || i == len(p)-1 {
+		// No "#" at all, or nothing after it: not a chunk path.
+		return p
+	}
+	// Trim removes every leading/trailing digit; any leftover means the
+	// text after the last "#" is not a pure chunk index.
+	if strings.Trim(p[i+1:], "0123456789") != "" {
+		return p
+	}
+	return p[:i]
+}
+
+// NumberChunks pairs every non-blank chunk body with a "<parent>#NNNN"
+// storage path. Numbering starts at 0001 and only advances for chunks that
+// are kept, so dropped (empty or whitespace-only) chunks leave no gaps.
+func NumberChunks(parent string, chunks []string) []NumberedChunk {
+	numbered := make([]NumberedChunk, 0, len(chunks))
+	for _, body := range chunks {
+		if strings.TrimSpace(body) == "" {
+			continue
+		}
+		// The next index is one past the number of chunks kept so far.
+		numbered = append(numbered, NumberedChunk{
+			Path:    fmt.Sprintf("%s#%04d", parent, len(numbered)+1),
+			Content: body,
+		})
+	}
+	return numbered
+}
+
+// ChunkMarkdown breaks a markdown document into embedding-sized pieces.
+//
+// The document is first cut into sections by splitAtHeadings; text ahead
+// of the first heading forms its own section. A section that fits within
+// maxBytes is emitted as-is, while a larger one is handed to
+// splitAtParagraphs, which packs its paragraphs greedily under the budget.
+//
+// A non-positive maxBytes falls back to 4000, chosen to sit comfortably
+// under nomic-embed-text's 2048-token context at roughly 4 chars/token
+// for English markdown.
+func ChunkMarkdown(content string, maxBytes int) []string {
+	budget := maxBytes
+	if budget <= 0 {
+		budget = 4000
+	}
+
+	sections := splitAtHeadings(content)
+	chunks := make([]string, 0, len(sections))
+	for _, section := range sections {
+		if len(section) > budget {
+			chunks = append(chunks, splitAtParagraphs(section, budget)...)
+		} else {
+			chunks = append(chunks, section)
+		}
+	}
+	return chunks
+}
+
+// splitAtHeadings cuts content into sections that each start with an H1 or
+// H2 heading line ("# " or "## ", optionally preceded by spaces). Text
+// before the first heading becomes the leading section.
+//
+// Lines inside a fenced code block (opened by a line starting with ``` or
+// ~~~ and closed by the next line starting with the same marker) are never
+// treated as headings, so "# comment" lines in shell or Python samples do
+// not tear a code block into separate sections. An unclosed fence keeps the
+// remainder of the document in the current section.
+//
+// Every returned section has its trailing newlines collapsed to exactly
+// one "\n".
+func splitAtHeadings(content string) []string {
+	var sections []string
+	var cur strings.Builder
+	flush := func() {
+		if cur.Len() == 0 {
+			return
+		}
+		// Trim trailing newlines then re-add a single one so a
+		// single-paragraph file round-trips to its original content rather
+		// than accumulating extra newlines from the empty-line split.
+		s := strings.TrimRight(cur.String(), "\n")
+		sections = append(sections, s+"\n")
+		cur.Reset()
+	}
+
+	fence := "" // marker of the currently open code fence, "" when outside
+	for _, ln := range strings.Split(content, "\n") {
+		trimmed := strings.TrimLeft(ln, " ")
+		switch {
+		case fence != "":
+			// Inside a fence only a line starting with the same marker
+			// matters: it closes the block.
+			if strings.HasPrefix(trimmed, fence) {
+				fence = ""
+			}
+		case strings.HasPrefix(trimmed, "```"):
+			fence = "```"
+		case strings.HasPrefix(trimmed, "~~~"):
+			fence = "~~~"
+		case strings.HasPrefix(trimmed, "# ") || strings.HasPrefix(trimmed, "## "):
+			// A heading closes the section accumulated so far (flush is a
+			// no-op when nothing has been accumulated yet).
+			flush()
+		}
+		cur.WriteString(ln)
+		cur.WriteByte('\n')
+	}
+	flush()
+	// Drop a whitespace-only final section (e.g. when content is empty or
+	// consists solely of blank lines).
+	if n := len(sections); n > 0 && strings.TrimSpace(sections[n-1]) == "" {
+		sections = sections[:n-1]
+	}
+	return sections
+}
+
+// splitAtParagraphs greedily packs the blank-line separated paragraphs of
+// section into sub-chunks of at most maxBytes, rejoining packed paragraphs
+// with "\n\n". A lone paragraph larger than maxBytes is kept whole as an
+// over-budget chunk: overspending a little beats cutting prose
+// mid-sentence.
+func splitAtParagraphs(section string, maxBytes int) []string {
+	const sep = "\n\n"
+	var chunks []string
+	var pending strings.Builder
+	for _, para := range strings.Split(section, sep) {
+		if para == "" {
+			continue
+		}
+		if pending.Len() == 0 {
+			// The first paragraph of a chunk is always accepted, even when
+			// it alone exceeds the budget.
+			pending.WriteString(para)
+			continue
+		}
+		// Appending costs the paragraph plus the separator that rejoins it.
+		if pending.Len()+len(sep)+len(para) > maxBytes {
+			chunks = append(chunks, pending.String())
+			pending.Reset()
+		} else {
+			pending.WriteString(sep)
+		}
+		pending.WriteString(para)
+	}
+	if pending.Len() > 0 {
+		chunks = append(chunks, pending.String())
+	}
+	return chunks
+}
--- a/ingestion/internal/vectorstore/chunk_test.go
+++ b/ingestion/internal/vectorstore/chunk_test.go
@@ -0,0 +1,72 @@
+package vectorstore_test
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestChunkMarkdown_ShortFileFitsInOne checks that heading-free content
+// well under the byte budget comes back as one chunk equal to the input.
+func TestChunkMarkdown_ShortFileFitsInOne(t *testing.T) {
+	out := vectorstore.ChunkMarkdown("Just a short paragraph.\n", 4000)
+	require.Len(t, out, 1)
+	assert.Equal(t, "Just a short paragraph.\n", out[0])
+}
+
+// TestChunkMarkdown_SplitsAtHeadings checks that a document with one H1
+// and two H2 headings yields at least two chunks, and that every chunk
+// after the first begins with a heading marker.
+func TestChunkMarkdown_SplitsAtHeadings(t *testing.T) {
+	src := "# Top\n\nintro\n\n## A\n\nbody a\n\n## B\n\nbody b\n"
+	out := vectorstore.ChunkMarkdown(src, 50) // tiny limit forces per-section split
+
+	assert.GreaterOrEqual(t, len(out), 2, "should split at H2 boundaries")
+	// Each chunk should start with a heading (top-level intro chunk OK without one)
+	for i, c := range out {
+		if i == 0 {
+			continue
+		}
+		assert.True(t, strings.HasPrefix(strings.TrimSpace(c), "#"),
+			"non-first chunk %d should start with heading: %q", i, c)
+	}
+}
+
+// TestChunkMarkdown_FurtherSplitsOversizedSection checks that a single
+// section larger than maxBytes is broken into more than one chunk and that
+// no resulting chunk is longer than twice the budget.
+func TestChunkMarkdown_FurtherSplitsOversizedSection(t *testing.T) {
+	// One H2 section with 4 paragraphs of roughly 35 bytes each (about 150
+	// bytes in total), limit 100. The Repeat count of 1 leaves each
+	// paragraph as written.
+	src := "## big\n\n" +
+		strings.Repeat("paragraph one is moderately long.\n\n", 1) +
+		strings.Repeat("paragraph two also moderately long.\n\n", 1) +
+		strings.Repeat("paragraph three is moderately long.\n\n", 1) +
+		strings.Repeat("paragraph four is moderately long.\n\n", 1)
+	out := vectorstore.ChunkMarkdown(src, 100)
+
+	assert.Greater(t, len(out), 1, "oversized section should sub-split at paragraph boundaries")
+	for i, c := range out {
+		assert.LessOrEqual(t, len(c), 200,
+			"chunk %d exceeds 2x maxBytes: %d", i, len(c))
+	}
+}
+
+// TestChunkMarkdown_PreservesContent is a spot check that chunking does not
+// lose text: a sample token from each heading and each body must still be
+// present once the chunks are concatenated.
+func TestChunkMarkdown_PreservesContent(t *testing.T) {
+	src := "# H1\n\nfirst section body.\n\n## H2a\n\nsecond section body.\n\n## H2b\n\nthird section body.\n"
+	out := vectorstore.ChunkMarkdown(src, 50)
+	joined := strings.Join(out, "")
+	// One representative token per heading and per body must appear in the
+	// joined output (this samples the content, it is not an exhaustive diff).
+	for _, token := range []string{"H1", "first", "H2a", "second", "H2b", "third"} {
+		assert.Contains(t, joined, token, "token %q missing after chunking", token)
+	}
+}
+
+// TestChunkMarkdown_NumberedSuffix exercises NumberChunks (not
+// ChunkMarkdown itself): three chunk bodies must receive the paths
+// "<parent>#0001" through "<parent>#0003" in order, with content kept.
+func TestChunkMarkdown_NumberedSuffix(t *testing.T) {
+	out := vectorstore.NumberChunks("knowledge/foo.md", []string{"a", "b", "c"})
+	require.Len(t, out, 3)
+	assert.Equal(t, "knowledge/foo.md#0001", out[0].Path)
+	assert.Equal(t, "knowledge/foo.md#0002", out[1].Path)
+	assert.Equal(t, "knowledge/foo.md#0003", out[2].Path)
+	assert.Equal(t, "a", out[0].Content)
+}
+
+// TestParentPath_StripsChunkSuffix checks that ParentPath removes a
+// "#NNNN" suffix and returns a path without any "#" unchanged.
+func TestParentPath_StripsChunkSuffix(t *testing.T) {
+	assert.Equal(t, "knowledge/foo.md", vectorstore.ParentPath("knowledge/foo.md#0001"))
+	assert.Equal(t, "knowledge/foo.md", vectorstore.ParentPath("knowledge/foo.md"))
+	assert.Equal(t, "wiki/a/b.md", vectorstore.ParentPath("wiki/a/b.md#9999"))
+}
--- a/ingestion/internal/vectorstore/sync.go
+++ b/ingestion/internal/vectorstore/sync.go
@@ -37,6 +37,13 @@ type SyncResult struct {
 // source pages; knowledge/ holds curated hand-written entries.
 var scanDirs = []string{"wiki", "knowledge"}

+// maxChunkBytes is the per-chunk byte budget passed to ChunkMarkdown.
+// Sized to fit comfortably under nomic-embed-text's 2048-token default
+// context (~4 chars/token for English markdown → ~8 KB ceiling; we sit
+// at 4 KB to leave headroom for unicode, code blocks, and tokenizer
+// variance).
+const maxChunkBytes = 4000
+
 // Sync brings the embedding store in line with brain/{wiki,knowledge}/
 // on disk:
 //   - new files (in the tree, not in the store) get embedded + upserted
@@ -55,7 +62,13 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
 	if err != nil {
 		return res, fmt.Errorf("known paths: %w", err)
 	}
-	seen := make(map[string]struct{})
+	// Build a parent → "any chunk known?" set so we can skip files that
+	// already have at least one chunk row in the store.
+	knownParents := make(map[string]struct{}, len(known))
+	for p := range known {
+		knownParents[ParentPath(p)] = struct{}{}
+	}
+	seenParents := make(map[string]struct{})

 	for _, sub := range scanDirs {
 		root := filepath.Join(brainDir, sub)
@@ -75,11 +88,12 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
 				return err
 			}
 			relSlash := filepath.ToSlash(rel)
-			seen[relSlash] = struct{}{}
+			seenParents[relSlash] = struct{}{}

-			if _, ok := known[relSlash]; ok {
-				// Already embedded — TODO: compare mtime once Store exposes
-				// updated_at so we re-embed on edit. For now, skip.
+			if _, ok := knownParents[relSlash]; ok {
+				// File has at least one chunk in the store already.
+				// TODO: compare mtime once Store exposes updated_at so we
+				// re-embed on edit. For now, skip.
 				return nil
 			}

@@ -88,16 +102,19 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
 				res.Errors = append(res.Errors, fmt.Errorf("read %s: %w", relSlash, readErr))
 				return nil
 			}
-			vec, embErr := embedder.Embed(ctx, string(content))
+			chunks := NumberChunks(relSlash, ChunkMarkdown(string(content), maxChunkBytes))
+			for _, ch := range chunks {
+				vec, embErr := embedder.Embed(ctx, ch.Content)
 				if embErr != nil {
-				res.Errors = append(res.Errors, fmt.Errorf("embed %s: %w", relSlash, embErr))
-				return nil
+					res.Errors = append(res.Errors, fmt.Errorf("embed %s: %w", ch.Path, embErr))
+					continue
 				}
-			if upErr := store.Upsert(ctx, relSlash, vec); upErr != nil {
-				res.Errors = append(res.Errors, fmt.Errorf("upsert %s: %w", relSlash, upErr))
-				return nil
+				if upErr := store.Upsert(ctx, ch.Path, vec); upErr != nil {
+					res.Errors = append(res.Errors, fmt.Errorf("upsert %s: %w", ch.Path, upErr))
+					continue
 				}
 				res.Added++
+			}
 			return nil
 		})
 		if err != nil {
@@ -105,9 +122,9 @@ func Sync(ctx context.Context, brainDir string, store Store, embedder Embedder)
 		}
 	}

-	// Drop rows whose file is gone.
+	// Drop chunk rows whose parent file is gone.
 	for path := range known {
-		if _, ok := seen[path]; ok {
+		if _, ok := seenParents[ParentPath(path)]; ok {
 			continue
 		}
 		if err := store.Delete(ctx, path); err != nil {
--- a/ingestion/internal/vectorstore/sync_test.go
+++ b/ingestion/internal/vectorstore/sync_test.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"

 	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
@@ -72,15 +73,15 @@ func TestSync_AddsNewFiles(t *testing.T) {
 	require.NoError(t, err)
 	assert.Equal(t, 2, res.Added)
 	assert.Empty(t, res.Deleted)
-	assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md")
-	assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md")
+	assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md#0001")
+	assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md#0001")
 }

 func TestSync_SkipsAlreadyKnown(t *testing.T) {
 	dir := t.TempDir()
 	writeNote(t, dir, "wiki/a/facts/x.md", "x")

-	store := &stubStore{known: map[string]struct{}{"wiki/a/facts/x.md": {}}}
+	store := &stubStore{known: map[string]struct{}{"wiki/a/facts/x.md#0001": {}}}
 	emb := stubEmbedder{vec: make([]float32, 768)}
 	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
 	require.NoError(t, err)
@@ -92,7 +93,7 @@ func TestSync_DeletesDisappearedFiles(t *testing.T) {
 	dir := t.TempDir()
 	require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
 	// store has a path that doesn't exist on disk anymore
-	store := &stubStore{known: map[string]struct{}{"wiki/old/facts/ghost.md": {}}}
+	store := &stubStore{known: map[string]struct{}{"wiki/old/facts/ghost.md#0001": {}}}
 	res, err := vectorstore.Sync(context.Background(), dir, &stubStoreWithDelete{stubStore: store}, stubEmbedder{vec: make([]float32, 768)})
 	require.NoError(t, err)
 	assert.Equal(t, 1, res.Deleted)
@@ -114,7 +115,7 @@ func TestSync_SkipsIndexFiles(t *testing.T) {
 	res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
 	require.NoError(t, err)
 	assert.Equal(t, 1, res.Added)
-	assert.NotContains(t, store.upserts, "wiki/a/_index.md")
+	assert.NotContains(t, store.upserts, "wiki/a/_index.md#0001")
 }

 func TestSync_ScansKnowledgeDir(t *testing.T) {
@@ -127,8 +128,75 @@ func TestSync_ScansKnowledgeDir(t *testing.T) {
 	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
 	require.NoError(t, err)
 	assert.Equal(t, 2, res.Added)
-	assert.Contains(t, store.upserts, "wiki/a/facts/x.md")
-	assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md")
+	assert.Contains(t, store.upserts, "wiki/a/facts/x.md#0001")
+	assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md#0001")
+}
+
+// TestSync_ChunksLongFiles checks that syncing one long multi-section file
+// adds more than one row, that every added row is keyed by a
+// "knowledge/long.md#..." chunk path, and that the bare file path is never
+// upserted.
+func TestSync_ChunksLongFiles(t *testing.T) {
+	dir := t.TempDir()
+	// Build a file that's well over the chunk byte budget. Multi-section
+	// markdown so the chunker has heading boundaries to cut on.
+	body := "# Doc\n\nintro line.\n\n"
+	for i := 0; i < 10; i++ {
+		body += "## Section " + string(rune('A'+i)) + "\n\n"
+		body += strings.Repeat("This section has a fair amount of content. ", 50) + "\n\n"
+	}
+	writeNote(t, dir, "knowledge/long.md", body)
+
+	store := &stubStore{known: map[string]struct{}{}}
+	emb := stubEmbedder{vec: make([]float32, 768)}
+	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
+	require.NoError(t, err)
+	assert.Greater(t, res.Added, 1, "long file should produce multiple chunk rows")
+	// Every upserted path for this file must be a chunk path.
+	chunkCount := 0
+	for p := range store.upserts {
+		if strings.HasPrefix(p, "knowledge/long.md#") {
+			chunkCount++
+		}
+	}
+	assert.Equal(t, res.Added, chunkCount, "all rows for long file should be chunk-suffixed")
+	// The bare parent path must NOT be upserted directly.
+	assert.NotContains(t, store.upserts, "knowledge/long.md")
+}
+
+// TestSync_ShortFileGetsSingleChunkRow checks that even a tiny file is
+// stored under a chunk path: exactly one row is added and its key is
+// "<file>#0001".
+func TestSync_ShortFileGetsSingleChunkRow(t *testing.T) {
+	dir := t.TempDir()
+	writeNote(t, dir, "wiki/short.md", "tiny body\n")
+
+	store := &stubStore{known: map[string]struct{}{}}
+	emb := stubEmbedder{vec: make([]float32, 768)}
+	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
+	require.NoError(t, err)
+	assert.Equal(t, 1, res.Added)
+	assert.Contains(t, store.upserts, "wiki/short.md#0001")
+}
+
+// TestSync_SkipsFileIfAnyChunkAlreadyKnown checks that a file on disk is
+// neither re-embedded nor upserted when the store already holds a chunk
+// row for it.
+// NOTE(review): the only known row is "#0001", so this covers the same
+// case as TestSync_SkipsAlreadyKnown; a row such as "#0002" alone would
+// exercise the "any chunk" wording more directly.
+func TestSync_SkipsFileIfAnyChunkAlreadyKnown(t *testing.T) {
+	dir := t.TempDir()
+	writeNote(t, dir, "wiki/foo.md", "body\n")
+
+	store := &stubStore{known: map[string]struct{}{
+		"wiki/foo.md#0001": {},
+	}}
+	emb := stubEmbedder{vec: make([]float32, 768)}
+	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
+	require.NoError(t, err)
+	assert.Equal(t, 0, res.Added)
+	assert.Empty(t, store.upserts)
+}
+
+// TestSync_DeletesAllChunksOfDisappearedFile checks that when a file is
+// gone from disk, every chunk row stored for it is deleted (three known
+// chunk rows -> three deletions).
+// NOTE(review): TestSync_DeletesDisappearedFiles wraps its store in
+// stubStoreWithDelete while this test passes the plain stubStore — confirm
+// stubStore's own Delete is sufficient for res.Deleted to be counted.
+func TestSync_DeletesAllChunksOfDisappearedFile(t *testing.T) {
+	dir := t.TempDir()
+	require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
+	store := &stubStore{known: map[string]struct{}{
+		"wiki/ghost.md#0001": {},
+		"wiki/ghost.md#0002": {},
+		"wiki/ghost.md#0003": {},
+	}}
+	res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
+	require.NoError(t, err)
+	assert.Equal(t, 3, res.Deleted)
 }

 func TestSync_NoOpWhenComponentsNil(t *testing.T) {
--- a/internal/skills/project/handlers.go
+++ b/internal/skills/project/handlers.go
@@ -19,6 +19,7 @@ type createArgs struct {
 	Folder         string `json:"folder"`
 	Stack          string `json:"stack"`
 	Private        bool   `json:"private"`
+	MirrorToGitHub bool   `json:"mirror_to_github,omitempty"`
 }

 type createResult struct {
@@ -59,11 +60,12 @@ func (s *Skill) handleCreate(ctx context.Context, raw json.RawMessage) (json.Raw

 	tmpl := templateFor(args.Stack)
 	giteaURL := fmt.Sprintf("http://gitea.d-ma.be/%s/%s", s.cfg.GiteaOwner, args.Name)
-	githubURL := fmt.Sprintf("https://github.com/%s/%s", s.cfg.GitHubOwner, args.Name)

 	res := createResult{
 		GiteaURL: giteaURL,
-		GitHubURL: githubURL,
+	}
+	if args.MirrorToGitHub {
+		res.GitHubURL = fmt.Sprintf("https://github.com/%s/%s", s.cfg.GitHubOwner, args.Name)
 	}

 	// Step 1: create_project_from_template. If the repo already exists,
@@ -75,6 +77,12 @@ func (s *Skill) handleCreate(ctx context.Context, raw json.RawMessage) (json.Raw
 	}
 	res.Reached = append(res.Reached, stepCreateRepo)

+	// Steps 2+3 are skipped when MirrorToGitHub is false. Default per
+	// infra ADR (Gitea as true master, GitHub as optional opt-in): keep
+	// client / business-logic / personal repos Gitea-only. Set
+	// `mirror_to_github: true` for open-source projects that want a
+	// public GitHub mirror (hyperguild, gitea-mcp, template-*).
+	if args.MirrorToGitHub {
 		// Step 2: create empty GitHub repo. Gitea's push-mirror cannot push
 		// to a non-existent remote, so the destination must exist before
 		// step 3 configures the mirror. Skipped when GitHub client is unset
@@ -94,6 +102,7 @@ func (s *Skill) handleCreate(ctx context.Context, raw json.RawMessage) (json.Raw
 			}
 		}
 		res.Reached = append(res.Reached, stepMirror)
+	}

 	// Step 3: commit staging namespace manifest to infra repo. Done before
 	// the issue so the staging env is reconciling by the time the issue lands.
@@ -228,7 +237,11 @@ func experimentBrief(args createArgs, existed bool) string {
 	b.WriteString("- Repo created from `template-")
 	b.WriteString(args.Stack)
 	b.WriteString("` on Gitea.\n")
+	if args.MirrorToGitHub {
 		b.WriteString("- Push-mirror configured to GitHub.\n")
+	} else {
+		b.WriteString("- Gitea-only (no GitHub mirror — set `mirror_to_github: true` to opt in).\n")
+	}
 	b.WriteString("- Staging namespace manifest committed to infra repo.\n\n")
 	if existed {
 		b.WriteString("> Note: this repo already existed when `project_create` ran — provisioning steps were re-applied idempotently.\n")
--- a/internal/skills/project/handlers_test.go
+++ b/internal/skills/project/handlers_test.go
@@ -158,6 +158,9 @@ func mustClient(t *testing.T, url string) *mcpclient.Client {
 	return c
 }

+// happyArgs returns the minimal valid request. With the Gitea-as-true-master
+// ADR shipped, this defaults to Gitea-only (mirror_to_github omitted = false).
+// Tests that need the full Gitea + GitHub mirror flow use mirroredArgs().
 func happyArgs() json.RawMessage {
 	return json.RawMessage(`{
 		"name":"my-experiment",
@@ -169,6 +172,20 @@ func happyArgs() json.RawMessage {
 	}`)
 }

+// mirroredArgs is happyArgs + mirror_to_github=true — the explicit opt-in
+// path. Equivalent to the pre-ADR default. The JSON is written out in full
+// rather than derived from happyArgs, so the two fixtures must be kept in
+// step by hand.
+func mirroredArgs() json.RawMessage {
+	return json.RawMessage(`{
+		"name":"my-experiment",
+		"description":"One-line desc",
+		"hypothesis":"We believe X produces Y",
+		"folder":"AGENTS",
+		"stack":"go-agent",
+		"private":true,
+		"mirror_to_github":true
+	}`)
+}
+
 func TestProjectCreate_HappyPath(t *testing.T) {
 	f := &fakeGiteaMCP{
 		Responses: map[string]any{
@@ -177,7 +194,7 @@ func TestProjectCreate_HappyPath(t *testing.T) {
 	}
 	skill, gh := newSkill(t, f)

-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.NoError(t, err)

 	var res map[string]any
@@ -228,7 +245,7 @@ func TestProjectCreate_GitHubExists_Idempotent(t *testing.T) {
 	skill, gh := newSkill(t, f)
 	gh.ReturnError = 422 // already exists

-	_, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	_, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.NoError(t, err, "422 already-exists should be idempotent")
 	require.Len(t, f.Calls, 4, "all gitea steps still run despite github 422")
 }
@@ -238,7 +255,7 @@ func TestProjectCreate_GitHubFails(t *testing.T) {
 	skill, gh := newSkill(t, f)
 	gh.ReturnError = 401 // bad PAT

-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.Error(t, err)
 	var res map[string]any
 	require.NoError(t, json.Unmarshal(out, &res))
@@ -255,7 +272,11 @@ func TestProjectCreate_NoGitHubClient_DegradedMode(t *testing.T) {
 	}
 	skill := newSkillNoGitHub(t, f)

-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	// Use mirroredArgs so we exercise the GitHub-mirror path. With the
+	// GitHub client nil, the create_github_repo step is skipped but the
+	// mirror step still attempts to configure the push-mirror remote
+	// (degraded mode preserves the prior contract for opted-in projects).
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.NoError(t, err)
 	var res map[string]any
 	require.NoError(t, json.Unmarshal(out, &res))
@@ -275,7 +296,7 @@ func TestProjectCreate_Idempotent_RepoExists(t *testing.T) {
 	}
 	skill, _ := newSkill(t, f)

-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.NoError(t, err)

 	var res map[string]any
@@ -295,7 +316,7 @@ func TestProjectCreate_MirrorFails(t *testing.T) {
 	}
 	skill, _ := newSkill(t, f)

-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.Error(t, err)
 	assert.Contains(t, err.Error(), `"mirror" failed`)

@@ -317,7 +338,7 @@ func TestProjectCreate_InfraCommitFails(t *testing.T) {
 	}
 	skill, _ := newSkill(t, f)

-	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	out, err := skill.Handle(context.Background(), "project_create", mirroredArgs())
 	require.Error(t, err)

 	var res map[string]any
@@ -351,6 +372,45 @@ func TestProjectCreate_ValidationErrors(t *testing.T) {
 	assert.Empty(t, f.Calls, "no upstream calls should occur on validation failure")
 }

+// TestProjectCreate_DefaultSkipsGitHubMirror covers the Gitea-only path
+// taken when mirror_to_github is omitted: it asserts the returned URLs,
+// the exact sequence of gitea-mcp tool calls, the absence of GitHub calls,
+// the reached step list, and the wording of the experiment-brief issue.
+func TestProjectCreate_DefaultSkipsGitHubMirror(t *testing.T) {
+	// Default (mirror_to_github omitted) skips create_github_repo + mirror
+	// per the Gitea-as-true-master ADR. Gitea repo + staging namespace
+	// + issue still run; github_url is empty in the response.
+	f := &fakeGiteaMCP{
+		Responses: map[string]any{
+			"issue_create": map[string]any{"html_url": "http://gitea.d-ma.be/mathias/my-experiment/issues/1"},
+		},
+	}
+	skill, gh := newSkill(t, f)
+
+	out, err := skill.Handle(context.Background(), "project_create", happyArgs())
+	require.NoError(t, err)
+
+	var res map[string]any
+	require.NoError(t, json.Unmarshal(out, &res))
+
+	assert.Equal(t, "http://gitea.d-ma.be/mathias/my-experiment", res["gitea_url"])
+	assert.Equal(t, "", res["github_url"], "github_url must be empty when mirror not opted in")
+	assert.Equal(t, "http://gitea.d-ma.be/mathias/my-experiment/issues/1", res["issue_url"])
+
+	// 3 gitea-mcp calls: template create, staging file write, issue. NO mirror call.
+	require.Len(t, f.Calls, 3)
+	assert.Equal(t, "create_project_from_template", f.Calls[0].Tool)
+	assert.Equal(t, "file_write_branch", f.Calls[1].Tool)
+	assert.Equal(t, "issue_create", f.Calls[2].Tool)
+
+	// Zero GitHub API calls.
+	assert.Empty(t, gh.Calls, "no GitHub repo created when mirror_to_github is false")
+
+	// reached lists the Gitea-only path. The unchecked type assertion
+	// panics (failing the test) if "reached" is missing or not an array.
+	reached := res["reached"].([]any)
+	assert.Equal(t, []any{"create_repo", "infra_commit", "issue"}, reached)
+
+	// experiment-brief body reflects Gitea-only provisioning.
+	require.Contains(t, f.Calls[2].Args["body"], "Gitea-only")
+	require.NotContains(t, f.Calls[2].Args["body"], "Push-mirror configured")
+}
+
 func TestProjectCreate_UnknownTool(t *testing.T) {
 	f := &fakeGiteaMCP{}
 	skill, _ := newSkill(t, f)
--- a/internal/skills/project/skill.go
+++ b/internal/skills/project/skill.go
@@ -79,13 +79,22 @@ func (s *Skill) Tools() []registry.ToolDef {
 				"description": "Selects template-go-agent or template-go-web.",
 			},
 			"private": map[string]any{"type": "boolean"},
+			"mirror_to_github": map[string]any{
+				"type": "boolean",
+				"description": "Default false. When true, also create an empty GitHub repo " +
+					"and configure a push-mirror from Gitea. Opt-in per the Gitea-as-true-master " +
+					"ADR — only set true for open-source projects (hyperguild, gitea-mcp, template-*). " +
+					"Never set true for client projects, business logic, or personal experiments.",
+			},
 		},
 		"required": []string{"name", "description", "hypothesis", "stack"},
 	})
 	return []registry.ToolDef{
 		{
 			Name: "project_create",
-			Description: "Bootstrap a new project: Gitea repo from template, GitHub push-mirror, staging namespace manifest, experiment-brief issue. Idempotent — re-running with an existing repo returns the existing URLs.",
+			Description: "Bootstrap a new project: Gitea repo from template, staging namespace manifest, " +
+				"experiment-brief issue. Optionally mirrors to GitHub when `mirror_to_github: true` " +
+				"(default false). Idempotent — re-running with an existing repo returns the existing URLs.",
 			InputSchema: schema,
 		},
 	}