feat(pipeline): wire ParseRawPages+BuildPages+CanonicalizeLinks into Run
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,9 +41,11 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
||||
schema = loadSchema(brainDir)
|
||||
}
|
||||
|
||||
sourceSlug := wiki.Slug(source)
|
||||
date := time.Now().UTC().Format("2006-01-02")
|
||||
chunks := Chunk(content, cfg.ChunkSize)
|
||||
|
||||
var allPages []wiki.Page
|
||||
var allRaw []RawPage
|
||||
var allWarnings []string
|
||||
|
||||
for _, chunk := range chunks {
|
||||
@@ -52,25 +54,19 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
||||
if err != nil {
|
||||
return Result{}, fmt.Errorf("LLM call: %w", err)
|
||||
}
|
||||
// TODO(task4): replace with RawPage-based pipeline
|
||||
rawPages, warnings := ParseRawPages(output)
|
||||
for _, rp := range rawPages {
|
||||
if rp.Title == "" {
|
||||
allWarnings = append(allWarnings, "skipped RawPage with empty title (TODO task4)")
|
||||
continue
|
||||
}
|
||||
allPages = append(allPages, wiki.Page{Path: rp.Type + "/" + rp.Title, Content: rp.Content})
|
||||
}
|
||||
raw, warnings := ParseRawPages(output)
|
||||
allRaw = append(allRaw, raw...)
|
||||
allWarnings = append(allWarnings, warnings...)
|
||||
}
|
||||
|
||||
resolved := Resolve(allPages, inventory)
|
||||
withRefs := injectSourceRefs(resolved, inventory, brainDir)
|
||||
pages := BuildPages(allRaw, sourceSlug, date)
|
||||
resolved := Resolve(pages, inventory)
|
||||
canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory)
|
||||
allWarnings = append(allWarnings, linkWarnings...)
|
||||
withRefs := injectSourceRefs(canonicalized, inventory, brainDir)
|
||||
merged := mergeAll(withRefs)
|
||||
|
||||
date := time.Now().UTC().Format("2006-01-02")
|
||||
var written []string
|
||||
|
||||
for _, page := range merged {
|
||||
if !dryRun {
|
||||
dest := filepath.Join(brainDir, filepath.FromSlash(page.Path))
|
||||
|
||||
@@ -15,24 +15,27 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mathiasbq/hyperguild/ingestion/internal/llm"
|
||||
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||
)
|
||||
|
||||
func TestRun_WritesPages(t *testing.T) {
|
||||
t.Skip("TODO(task4): update stub to RawPage format")
|
||||
brainDir := t.TempDir()
|
||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||
}
|
||||
|
||||
llmResponse := mustJSON([]wiki.Page{
|
||||
llmResponse := mustJSON([]RawPage{
|
||||
{
|
||||
Path: "wiki/sources/test-article.md",
|
||||
Content: "---\ntitle: Test Article\ntype: article\ndomain: software-engineering\ndate_ingested: 2026-04-22\nlast_updated: 2026-04-22\naliases:\n - Test Article\n---\n\n## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n## Entities Mentioned\n\n## Open Questions Raised\n",
|
||||
Title: "Test Article",
|
||||
Type: "source",
|
||||
Subtype: "article",
|
||||
Domain: "software-engineering",
|
||||
Content: "## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n[[Testing]]\n\n## Entities Mentioned\n\n## Open Questions Raised\n",
|
||||
},
|
||||
{
|
||||
Path: "wiki/concepts/testing.md",
|
||||
Content: "---\ntitle: Testing\ndomain: software-engineering\nlast_updated: 2026-04-22\naliases:\n - Testing\n---\n\n## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n",
|
||||
Title: "Testing",
|
||||
Type: "concept",
|
||||
Domain: "software-engineering",
|
||||
Content: "## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n",
|
||||
},
|
||||
})
|
||||
|
||||
@@ -54,7 +57,6 @@ func TestRun_WritesPages(t *testing.T) {
|
||||
result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false)
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, result.Pages, 2)
|
||||
assert.Empty(t, result.Warnings)
|
||||
|
||||
_, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "test-article.md"))
|
||||
require.NoError(t, err)
|
||||
@@ -67,15 +69,16 @@ func TestRun_WritesPages(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRun_DryRunDoesNotWrite(t *testing.T) {
|
||||
t.Skip("TODO(task4): update stub to RawPage format")
|
||||
brainDir := t.TempDir()
|
||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||
}
|
||||
|
||||
llmResponse := mustJSON([]wiki.Page{{
|
||||
Path: "wiki/sources/foo.md",
|
||||
Content: "---\ntitle: Foo\n---\n\n## Summary\n\nFoo.\n",
|
||||
llmResponse := mustJSON([]RawPage{{
|
||||
Title: "Foo",
|
||||
Type: "source",
|
||||
Subtype: "article",
|
||||
Content: "## Summary\n\nFoo.\n",
|
||||
}})
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -95,16 +98,15 @@ func TestRun_DryRunDoesNotWrite(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRun_MergesDuplicatePaths(t *testing.T) {
|
||||
t.Skip("TODO(task4): update stub to RawPage format")
|
||||
brainDir := t.TempDir()
|
||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||
}
|
||||
|
||||
// LLM returns same path twice (simulates multi-chunk merge)
|
||||
llmResponse := mustJSON([]wiki.Page{
|
||||
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFirst.\n\n## Related Concepts\n\n- [[bar|Bar]]\n"},
|
||||
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nSecond.\n\n## Related Concepts\n\n- [[baz|Baz]]\n"},
|
||||
// LLM returns same title twice (simulates multi-chunk duplicate)
|
||||
llmResponse := mustJSON([]RawPage{
|
||||
{Title: "Foo", Type: "concept", Content: "## Definition\n\nFirst.\n\n## Related Concepts\n\n[[Bar]]\n"},
|
||||
{Title: "Foo", Type: "concept", Content: "## Definition\n\nSecond.\n\n## Related Concepts\n\n[[Baz]]\n"},
|
||||
})
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -123,8 +125,9 @@ func TestRun_MergesDuplicatePaths(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
// keep-first for Definition, union for Related Concepts
|
||||
assert.Contains(t, string(content), "First.")
|
||||
assert.Contains(t, string(content), "[[bar|Bar]]")
|
||||
assert.Contains(t, string(content), "[[baz|Baz]]")
|
||||
// Bar and Baz unknown in empty inventory → left as plain [[links]]
|
||||
assert.Contains(t, string(content), "[[Bar]]")
|
||||
assert.Contains(t, string(content), "[[Baz]]")
|
||||
}
|
||||
|
||||
func mustJSON(v any) string {
|
||||
|
||||
Reference in New Issue
Block a user