feat(ingestion): add pipeline orchestrator with prompt builder
Adds prompt.go (BuildPrompt + systemPrompt) and pipeline.go (Run, Config, Result, mergeAll) that wire chunking, LLM calls, parse, merge, index rebuild, and log append into a single ingestion pipeline. Includes integration tests covering write, dry-run, and duplicate-path merge scenarios. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
130
ingestion/internal/pipeline/pipeline_test.go
Normal file
130
ingestion/internal/pipeline/pipeline_test.go
Normal file
@@ -0,0 +1,130 @@
|
||||
// ingestion/internal/pipeline/pipeline_test.go
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mathiasbq/hyperguild/ingestion/internal/llm"
|
||||
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||
)
|
||||
|
||||
func TestRun_WritesPages(t *testing.T) {
|
||||
brainDir := t.TempDir()
|
||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||
}
|
||||
|
||||
llmResponse := mustJSON([]wiki.Page{
|
||||
{
|
||||
Path: "wiki/sources/test-article.md",
|
||||
Content: "---\ntitle: Test Article\ntype: article\ndomain: software-engineering\ndate_ingested: 2026-04-22\nlast_updated: 2026-04-22\naliases:\n - Test Article\n---\n\n## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n## Entities Mentioned\n\n## Open Questions Raised\n",
|
||||
},
|
||||
{
|
||||
Path: "wiki/concepts/testing.md",
|
||||
Content: "---\ntitle: Testing\ndomain: software-engineering\nlast_updated: 2026-04-22\naliases:\n - Testing\n---\n\n## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n",
|
||||
},
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(map[string]any{
|
||||
"choices": []map[string]any{
|
||||
{"message": map[string]any{"role": "assistant", "content": llmResponse}},
|
||||
},
|
||||
})
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
cfg := Config{
|
||||
Complete: llm.New(srv.URL, "", "test-model", 30*time.Second).Complete,
|
||||
ChunkSize: 0,
|
||||
}
|
||||
|
||||
result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false)
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, result.Pages, 2)
|
||||
assert.Empty(t, result.Warnings)
|
||||
|
||||
_, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "test-article.md"))
|
||||
require.NoError(t, err)
|
||||
_, err = os.Stat(filepath.Join(brainDir, "wiki", "concepts", "testing.md"))
|
||||
require.NoError(t, err)
|
||||
_, err = os.Stat(filepath.Join(brainDir, "wiki", "index.md"))
|
||||
require.NoError(t, err)
|
||||
_, err = os.Stat(filepath.Join(brainDir, "log.md"))
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func TestRun_DryRunDoesNotWrite(t *testing.T) {
|
||||
brainDir := t.TempDir()
|
||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||
}
|
||||
|
||||
llmResponse := mustJSON([]wiki.Page{{
|
||||
Path: "wiki/sources/foo.md",
|
||||
Content: "---\ntitle: Foo\n---\n\n## Summary\n\nFoo.\n",
|
||||
}})
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewEncoder(w).Encode(map[string]any{
|
||||
"choices": []map[string]any{{"message": map[string]any{"content": llmResponse}}},
|
||||
})
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
cfg := Config{Complete: llm.New(srv.URL, "", "m", 30*time.Second).Complete}
|
||||
result, err := Run(context.Background(), cfg, brainDir, "foo content", "foo", true)
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, result.Pages, 1)
|
||||
|
||||
_, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "foo.md"))
|
||||
assert.True(t, os.IsNotExist(err))
|
||||
}
|
||||
|
||||
func TestRun_MergesDuplicatePaths(t *testing.T) {
|
||||
brainDir := t.TempDir()
|
||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||
}
|
||||
|
||||
// LLM returns same path twice (simulates multi-chunk merge)
|
||||
llmResponse := mustJSON([]wiki.Page{
|
||||
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFirst.\n\n## Related Concepts\n\n- [[bar|Bar]]\n"},
|
||||
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nSecond.\n\n## Related Concepts\n\n- [[baz|Baz]]\n"},
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewEncoder(w).Encode(map[string]any{
|
||||
"choices": []map[string]any{{"message": map[string]any{"content": llmResponse}}},
|
||||
})
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
cfg := Config{Complete: llm.New(srv.URL, "", "m", 30*time.Second).Complete}
|
||||
result, err := Run(context.Background(), cfg, brainDir, "content", "foo", false)
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, result.Pages, 1) // deduplicated
|
||||
|
||||
content, err := os.ReadFile(filepath.Join(brainDir, "wiki", "concepts", "foo.md"))
|
||||
require.NoError(t, err)
|
||||
// keep-first for Definition, union for Related Concepts
|
||||
assert.Contains(t, string(content), "First.")
|
||||
assert.Contains(t, string(content), "[[bar|Bar]]")
|
||||
assert.Contains(t, string(content), "[[baz|Baz]]")
|
||||
}
|
||||
|
||||
// mustJSON marshals v with encoding/json and returns the result as a string.
// It panics if v cannot be marshaled, so a broken fixture fails loudly at the
// point of construction instead of silently becoming an empty string.
func mustJSON(v any) string {
	b, err := json.Marshal(v)
	if err != nil {
		panic("mustJSON: " + err.Error())
	}
	return string(b)
}
|
||||
Reference in New Issue
Block a user