Files
hyperguild/ingestion/internal/pipeline/pipeline_test.go
Mathias Bergqvist 103f4d90bf feat(ingestion): add pipeline orchestrator with prompt builder
Adds prompt.go (BuildPrompt + systemPrompt) and pipeline.go (Run, Config,
Result, mergeAll) that wire chunking, LLM calls, parse, merge, index rebuild,
and log append into a single ingestion pipeline. Includes integration tests
covering write, dry-run, and duplicate-path merge scenarios.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-22 22:45:19 +02:00

131 lines
4.7 KiB
Go

// ingestion/internal/pipeline/pipeline_test.go
package pipeline
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/mathiasbq/hyperguild/ingestion/internal/llm"
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
)
// TestRun_WritesPages runs the pipeline with dryRun=false against a stub LLM
// server that returns two pages, then checks that both pages are reported in
// the result without warnings and that the page files, wiki/index.md and
// log.md exist under the brain directory afterwards.
func TestRun_WritesPages(t *testing.T) {
	brainDir := t.TempDir()
	for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
		require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
	}

	// The page list is JSON-encoded and handed back as the message content of
	// the stubbed completion response.
	llmResponse := mustJSON([]wiki.Page{
		{
			Path:    "wiki/sources/test-article.md",
			Content: "---\ntitle: Test Article\ntype: article\ndomain: software-engineering\ndate_ingested: 2026-04-22\nlast_updated: 2026-04-22\naliases:\n - Test Article\n---\n\n## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n## Entities Mentioned\n\n## Open Questions Raised\n",
		},
		{
			Path:    "wiki/concepts/testing.md",
			Content: "---\ntitle: Testing\ndomain: software-engineering\nlast_updated: 2026-04-22\naliases:\n - Testing\n---\n\n## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n",
		},
	})

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		payload := map[string]any{
			"choices": []map[string]any{
				{"message": map[string]any{"role": "assistant", "content": llmResponse}},
			},
		}
		// Surface a broken stub directly instead of letting it show up as an
		// unrelated failure inside Run. Errorf is safe from the handler
		// goroutine, and srv.Close waits for outstanding requests.
		if err := json.NewEncoder(w).Encode(payload); err != nil {
			t.Errorf("encoding stub LLM response: %v", err)
		}
	}))
	defer srv.Close()

	cfg := Config{
		Complete:  llm.New(srv.URL, "", "test-model", 30*time.Second).Complete,
		ChunkSize: 0,
	}

	result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false)
	require.NoError(t, err)
	assert.Len(t, result.Pages, 2)
	assert.Empty(t, result.Warnings)

	// Both pages plus the index and the log must be present on disk.
	wantFiles := []string{
		filepath.Join("wiki", "sources", "test-article.md"),
		filepath.Join("wiki", "concepts", "testing.md"),
		filepath.Join("wiki", "index.md"),
		"log.md",
	}
	for _, rel := range wantFiles {
		_, err := os.Stat(filepath.Join(brainDir, rel))
		require.NoError(t, err, "expected %s to exist after Run", rel)
	}
}
// TestRun_DryRunDoesNotWrite runs the pipeline with dryRun=true against a stub
// LLM server that returns a single page, then checks that the page is reported
// in the result but that its file was not created under the brain directory.
func TestRun_DryRunDoesNotWrite(t *testing.T) {
	brainDir := t.TempDir()
	for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
		require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
	}

	llmResponse := mustJSON([]wiki.Page{{
		Path:    "wiki/sources/foo.md",
		Content: "---\ntitle: Foo\n---\n\n## Summary\n\nFoo.\n",
	}})

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		// Declare the body as JSON, matching the stub in TestRun_WritesPages.
		w.Header().Set("Content-Type", "application/json")
		payload := map[string]any{
			"choices": []map[string]any{{"message": map[string]any{"content": llmResponse}}},
		}
		// Report a broken stub here rather than as a confusing failure in Run.
		if err := json.NewEncoder(w).Encode(payload); err != nil {
			t.Errorf("encoding stub LLM response: %v", err)
		}
	}))
	defer srv.Close()

	cfg := Config{Complete: llm.New(srv.URL, "", "m", 30*time.Second).Complete}

	result, err := Run(context.Background(), cfg, brainDir, "foo content", "foo", true)
	require.NoError(t, err)
	assert.Len(t, result.Pages, 1)

	// The page is returned to the caller, but nothing may be written for it.
	pagePath := filepath.Join(brainDir, "wiki", "sources", "foo.md")
	_, err = os.Stat(pagePath)
	assert.True(t, os.IsNotExist(err), "dry run must not create %s (stat error: %v)", pagePath, err)
}
// TestRun_MergesDuplicatePaths feeds the pipeline an LLM response containing
// two pages with the same path and checks that the result holds a single page
// whose written file keeps the first Definition text and contains the Related
// Concepts links from both inputs.
func TestRun_MergesDuplicatePaths(t *testing.T) {
	brainDir := t.TempDir()
	for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
		require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
	}

	// LLM returns same path twice (simulates multi-chunk merge)
	llmResponse := mustJSON([]wiki.Page{
		{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFirst.\n\n## Related Concepts\n\n- [[bar|Bar]]\n"},
		{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nSecond.\n\n## Related Concepts\n\n- [[baz|Baz]]\n"},
	})

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		// Declare the body as JSON, matching the stub in TestRun_WritesPages.
		w.Header().Set("Content-Type", "application/json")
		payload := map[string]any{
			"choices": []map[string]any{{"message": map[string]any{"content": llmResponse}}},
		}
		// Report a broken stub here rather than as a confusing failure in Run.
		if err := json.NewEncoder(w).Encode(payload); err != nil {
			t.Errorf("encoding stub LLM response: %v", err)
		}
	}))
	defer srv.Close()

	cfg := Config{Complete: llm.New(srv.URL, "", "m", 30*time.Second).Complete}

	result, err := Run(context.Background(), cfg, brainDir, "content", "foo", false)
	require.NoError(t, err)
	assert.Len(t, result.Pages, 1) // deduplicated

	content, err := os.ReadFile(filepath.Join(brainDir, "wiki", "concepts", "foo.md"))
	require.NoError(t, err)

	// keep-first for Definition, union for Related Concepts
	merged := string(content)
	assert.Contains(t, merged, "First.")
	assert.Contains(t, merged, "[[bar|Bar]]")
	assert.Contains(t, merged, "[[baz|Baz]]")
}
// mustJSON returns v encoded as a JSON string. It panics if v cannot be
// marshaled, so a bad fixture fails loudly instead of silently yielding an
// empty payload for the stub server.
func mustJSON(v any) string {
	b, err := json.Marshal(v)
	if err != nil {
		panic("mustJSON: " + err.Error())
	}
	return string(b)
}