feat(pipeline): add POST /ingest-raw for direct batch ingestion without LLM
Allows callers to provide pre-structured RawPage data directly, bypassing the LLM extraction step. The pipeline still handles slug computation, frontmatter, link canonicalization, source back-references, and dedup — only the extraction is skipped. Useful when a more capable model or manual curation produces the structured data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -59,11 +59,31 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
||||
allWarnings = append(allWarnings, warnings...)
|
||||
}
|
||||
|
||||
pages, buildWarnings := BuildPages(allRaw, sourceSlug, date)
|
||||
allWarnings = append(allWarnings, buildWarnings...)
|
||||
return buildAndWrite(allRaw, sourceSlug, date, brainDir, source, inventory, allWarnings, dryRun)
|
||||
}
|
||||
|
||||
// RunRaw runs the pipeline on pre-parsed RawPages, skipping the LLM extraction
|
||||
// step. Use this when the caller has already produced the structured RawPage data
|
||||
// (e.g. from a more capable model or manual curation).
|
||||
func RunRaw(brainDir, source string, rawPages []RawPage, dryRun bool) (Result, error) {
|
||||
inventory, err := wiki.LoadInventory(brainDir)
|
||||
if err != nil {
|
||||
return Result{}, fmt.Errorf("load inventory: %w", err)
|
||||
}
|
||||
|
||||
sourceSlug := wiki.Slug(source)
|
||||
date := time.Now().UTC().Format("2006-01-02")
|
||||
|
||||
return buildAndWrite(rawPages, sourceSlug, date, brainDir, source, inventory, nil, dryRun)
|
||||
}
|
||||
|
||||
// buildAndWrite runs BuildPages through write for both Run and RunRaw.
|
||||
func buildAndWrite(rawPages []RawPage, sourceSlug, date, brainDir, source string, inventory map[wiki.PageType][]wiki.Entry, warnings []string, dryRun bool) (Result, error) {
|
||||
pages, buildWarnings := BuildPages(rawPages, sourceSlug, date)
|
||||
warnings = append(warnings, buildWarnings...)
|
||||
resolved := Resolve(pages, inventory)
|
||||
canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory)
|
||||
allWarnings = append(allWarnings, linkWarnings...)
|
||||
warnings = append(warnings, linkWarnings...)
|
||||
withRefs := injectSourceRefs(canonicalized, inventory, brainDir)
|
||||
merged := mergeAll(withRefs)
|
||||
|
||||
@@ -83,14 +103,14 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
||||
|
||||
if !dryRun {
|
||||
if err := wiki.RebuildIndex(brainDir, date); err != nil {
|
||||
allWarnings = append(allWarnings, fmt.Sprintf("rebuild index: %v", err))
|
||||
warnings = append(warnings, fmt.Sprintf("rebuild index: %v", err))
|
||||
}
|
||||
if err := wiki.AppendLog(brainDir, source, written, allWarnings, date); err != nil {
|
||||
allWarnings = append(allWarnings, fmt.Sprintf("append log: %v", err))
|
||||
if err := wiki.AppendLog(brainDir, source, written, warnings, date); err != nil {
|
||||
warnings = append(warnings, fmt.Sprintf("append log: %v", err))
|
||||
}
|
||||
}
|
||||
|
||||
return Result{Pages: written, Warnings: allWarnings}, nil
|
||||
return Result{Pages: written, Warnings: warnings}, nil
|
||||
}
|
||||
|
||||
// mergeAll deduplicates pages by path, merging content from later occurrences.
|
||||
|
||||
Reference in New Issue
Block a user