feat(pipeline): wire ParseRawPages+BuildPages+CanonicalizeLinks into Run
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,9 +41,11 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
|||||||
schema = loadSchema(brainDir)
|
schema = loadSchema(brainDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sourceSlug := wiki.Slug(source)
|
||||||
|
date := time.Now().UTC().Format("2006-01-02")
|
||||||
chunks := Chunk(content, cfg.ChunkSize)
|
chunks := Chunk(content, cfg.ChunkSize)
|
||||||
|
|
||||||
var allPages []wiki.Page
|
var allRaw []RawPage
|
||||||
var allWarnings []string
|
var allWarnings []string
|
||||||
|
|
||||||
for _, chunk := range chunks {
|
for _, chunk := range chunks {
|
||||||
@@ -52,25 +54,19 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Result{}, fmt.Errorf("LLM call: %w", err)
|
return Result{}, fmt.Errorf("LLM call: %w", err)
|
||||||
}
|
}
|
||||||
// TODO(task4): replace with RawPage-based pipeline
|
raw, warnings := ParseRawPages(output)
|
||||||
rawPages, warnings := ParseRawPages(output)
|
allRaw = append(allRaw, raw...)
|
||||||
for _, rp := range rawPages {
|
|
||||||
if rp.Title == "" {
|
|
||||||
allWarnings = append(allWarnings, "skipped RawPage with empty title (TODO task4)")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
allPages = append(allPages, wiki.Page{Path: rp.Type + "/" + rp.Title, Content: rp.Content})
|
|
||||||
}
|
|
||||||
allWarnings = append(allWarnings, warnings...)
|
allWarnings = append(allWarnings, warnings...)
|
||||||
}
|
}
|
||||||
|
|
||||||
resolved := Resolve(allPages, inventory)
|
pages := BuildPages(allRaw, sourceSlug, date)
|
||||||
withRefs := injectSourceRefs(resolved, inventory, brainDir)
|
resolved := Resolve(pages, inventory)
|
||||||
|
canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory)
|
||||||
|
allWarnings = append(allWarnings, linkWarnings...)
|
||||||
|
withRefs := injectSourceRefs(canonicalized, inventory, brainDir)
|
||||||
merged := mergeAll(withRefs)
|
merged := mergeAll(withRefs)
|
||||||
|
|
||||||
date := time.Now().UTC().Format("2006-01-02")
|
|
||||||
var written []string
|
var written []string
|
||||||
|
|
||||||
for _, page := range merged {
|
for _, page := range merged {
|
||||||
if !dryRun {
|
if !dryRun {
|
||||||
dest := filepath.Join(brainDir, filepath.FromSlash(page.Path))
|
dest := filepath.Join(brainDir, filepath.FromSlash(page.Path))
|
||||||
|
|||||||
@@ -15,24 +15,27 @@ import (
|
|||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/llm"
|
"github.com/mathiasbq/hyperguild/ingestion/internal/llm"
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestRun_WritesPages(t *testing.T) {
|
func TestRun_WritesPages(t *testing.T) {
|
||||||
t.Skip("TODO(task4): update stub to RawPage format")
|
|
||||||
brainDir := t.TempDir()
|
brainDir := t.TempDir()
|
||||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||||
}
|
}
|
||||||
|
|
||||||
llmResponse := mustJSON([]wiki.Page{
|
llmResponse := mustJSON([]RawPage{
|
||||||
{
|
{
|
||||||
Path: "wiki/sources/test-article.md",
|
Title: "Test Article",
|
||||||
Content: "---\ntitle: Test Article\ntype: article\ndomain: software-engineering\ndate_ingested: 2026-04-22\nlast_updated: 2026-04-22\naliases:\n - Test Article\n---\n\n## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n## Entities Mentioned\n\n## Open Questions Raised\n",
|
Type: "source",
|
||||||
|
Subtype: "article",
|
||||||
|
Domain: "software-engineering",
|
||||||
|
Content: "## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n[[Testing]]\n\n## Entities Mentioned\n\n## Open Questions Raised\n",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Path: "wiki/concepts/testing.md",
|
Title: "Testing",
|
||||||
Content: "---\ntitle: Testing\ndomain: software-engineering\nlast_updated: 2026-04-22\naliases:\n - Testing\n---\n\n## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n",
|
Type: "concept",
|
||||||
|
Domain: "software-engineering",
|
||||||
|
Content: "## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n",
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -54,7 +57,6 @@ func TestRun_WritesPages(t *testing.T) {
|
|||||||
result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false)
|
result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
assert.Len(t, result.Pages, 2)
|
assert.Len(t, result.Pages, 2)
|
||||||
assert.Empty(t, result.Warnings)
|
|
||||||
|
|
||||||
_, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "test-article.md"))
|
_, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "test-article.md"))
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
@@ -67,15 +69,16 @@ func TestRun_WritesPages(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRun_DryRunDoesNotWrite(t *testing.T) {
|
func TestRun_DryRunDoesNotWrite(t *testing.T) {
|
||||||
t.Skip("TODO(task4): update stub to RawPage format")
|
|
||||||
brainDir := t.TempDir()
|
brainDir := t.TempDir()
|
||||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||||
}
|
}
|
||||||
|
|
||||||
llmResponse := mustJSON([]wiki.Page{{
|
llmResponse := mustJSON([]RawPage{{
|
||||||
Path: "wiki/sources/foo.md",
|
Title: "Foo",
|
||||||
Content: "---\ntitle: Foo\n---\n\n## Summary\n\nFoo.\n",
|
Type: "source",
|
||||||
|
Subtype: "article",
|
||||||
|
Content: "## Summary\n\nFoo.\n",
|
||||||
}})
|
}})
|
||||||
|
|
||||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -95,16 +98,15 @@ func TestRun_DryRunDoesNotWrite(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRun_MergesDuplicatePaths(t *testing.T) {
|
func TestRun_MergesDuplicatePaths(t *testing.T) {
|
||||||
t.Skip("TODO(task4): update stub to RawPage format")
|
|
||||||
brainDir := t.TempDir()
|
brainDir := t.TempDir()
|
||||||
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
|
||||||
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
|
||||||
}
|
}
|
||||||
|
|
||||||
// LLM returns same path twice (simulates multi-chunk merge)
|
// LLM returns same title twice (simulates multi-chunk duplicate)
|
||||||
llmResponse := mustJSON([]wiki.Page{
|
llmResponse := mustJSON([]RawPage{
|
||||||
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFirst.\n\n## Related Concepts\n\n- [[bar|Bar]]\n"},
|
{Title: "Foo", Type: "concept", Content: "## Definition\n\nFirst.\n\n## Related Concepts\n\n[[Bar]]\n"},
|
||||||
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nSecond.\n\n## Related Concepts\n\n- [[baz|Baz]]\n"},
|
{Title: "Foo", Type: "concept", Content: "## Definition\n\nSecond.\n\n## Related Concepts\n\n[[Baz]]\n"},
|
||||||
})
|
})
|
||||||
|
|
||||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -123,8 +125,9 @@ func TestRun_MergesDuplicatePaths(t *testing.T) {
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
// keep-first for Definition, union for Related Concepts
|
// keep-first for Definition, union for Related Concepts
|
||||||
assert.Contains(t, string(content), "First.")
|
assert.Contains(t, string(content), "First.")
|
||||||
assert.Contains(t, string(content), "[[bar|Bar]]")
|
// Bar and Baz unknown in empty inventory → left as plain [[links]]
|
||||||
assert.Contains(t, string(content), "[[baz|Baz]]")
|
assert.Contains(t, string(content), "[[Bar]]")
|
||||||
|
assert.Contains(t, string(content), "[[Baz]]")
|
||||||
}
|
}
|
||||||
|
|
||||||
func mustJSON(v any) string {
|
func mustJSON(v any) string {
|
||||||
|
|||||||
@@ -14,13 +14,12 @@ import (
|
|||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
|
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// successComplete returns a valid JSON-encoded page array for any call.
|
// successComplete returns a valid JSON-encoded RawPage array for any call.
|
||||||
func successComplete(page wiki.Page) pipeline.CompleteFunc {
|
func successComplete(raw pipeline.RawPage) pipeline.CompleteFunc {
|
||||||
return func(ctx context.Context, system, user string) (string, error) {
|
return func(ctx context.Context, system, user string) (string, error) {
|
||||||
b, err := json.Marshal([]wiki.Page{page})
|
b, err := json.Marshal([]pipeline.RawPage{raw})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -50,16 +49,19 @@ func TestStart_ProcessesFile(t *testing.T) {
|
|||||||
require.NoError(t, os.WriteFile(rawFile, []byte("Content about Shape Up."), 0o644))
|
require.NoError(t, os.WriteFile(rawFile, []byte("Content about Shape Up."), 0o644))
|
||||||
|
|
||||||
date := time.Now().UTC().Format("2006-01-02")
|
date := time.Now().UTC().Format("2006-01-02")
|
||||||
wikiPage := wiki.Page{
|
rawPage := pipeline.RawPage{
|
||||||
Path: "wiki/sources/shape-up-book.md",
|
Title: "Shape Up Book",
|
||||||
Content: "---\ntitle: Shape Up Book\ntype: article\ndomain: product-management\ndate_ingested: " + date + "\nlast_updated: " + date + "\naliases:\n - Shape Up Book\n---\n\n## Summary\n\nA book about Shape Up.\n",
|
Type: "source",
|
||||||
|
Subtype: "article",
|
||||||
|
Domain: "product-management",
|
||||||
|
Content: "## Summary\n\nA book about Shape Up.\n",
|
||||||
}
|
}
|
||||||
|
|
||||||
cfg := Config{
|
cfg := Config{
|
||||||
BrainDir: brainDir,
|
BrainDir: brainDir,
|
||||||
Interval: 50 * time.Millisecond,
|
Interval: 50 * time.Millisecond,
|
||||||
Pipeline: pipeline.Config{
|
Pipeline: pipeline.Config{
|
||||||
Complete: successComplete(wikiPage),
|
Complete: successComplete(rawPage),
|
||||||
ChunkSize: 0,
|
ChunkSize: 0,
|
||||||
Schema: "# Schema\nThree page types.",
|
Schema: "# Schema\nThree page types.",
|
||||||
},
|
},
|
||||||
@@ -193,12 +195,14 @@ func TestProcessDir_SkipsSubdirs(t *testing.T) {
|
|||||||
// Track which sources were passed to Complete.
|
// Track which sources were passed to Complete.
|
||||||
var processedSources []string
|
var processedSources []string
|
||||||
completeFn := func(ctx context.Context, system, user string) (string, error) {
|
completeFn := func(ctx context.Context, system, user string) (string, error) {
|
||||||
// Record that this was called; return a minimal valid page.
|
// Record that this was called; return a minimal valid RawPage.
|
||||||
page := wiki.Page{
|
raw := pipeline.RawPage{
|
||||||
Path: "wiki/sources/valid.md",
|
Title: "Valid",
|
||||||
Content: "---\ntitle: Valid\n---\n\n## Summary\n\nValid.\n",
|
Type: "source",
|
||||||
|
Subtype: "article",
|
||||||
|
Content: "## Summary\n\nValid.\n",
|
||||||
}
|
}
|
||||||
b, _ := json.Marshal([]wiki.Page{page})
|
b, _ := json.Marshal([]pipeline.RawPage{raw})
|
||||||
processedSources = append(processedSources, "called")
|
processedSources = append(processedSources, "called")
|
||||||
return string(b), nil
|
return string(b), nil
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user