107 lines
2.7 KiB
Go
107 lines
2.7 KiB
Go
// ingestion/internal/pipeline/build.go
|
|
package pipeline
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
|
)
|
|
|
|
// BuildPages converts RawPages from the LLM into wiki.Pages with computed slugs,
|
|
// paths, and YAML frontmatter. sourceSlug is the slug of the source being ingested
|
|
// (derived from the filename, not the LLM title). Pages whose title resolves to an
|
|
// empty slug are skipped and returned as warnings instead.
|
|
func BuildPages(rawPages []RawPage, sourceSlug, date string) ([]wiki.Page, []string) {
|
|
out := make([]wiki.Page, 0, len(rawPages))
|
|
var warnings []string
|
|
for _, rp := range rawPages {
|
|
slug := computeSlug(rp, sourceSlug)
|
|
if slug == "" {
|
|
warnings = append(warnings, fmt.Sprintf("skipped page with empty title (type: %s)", rp.Type))
|
|
continue
|
|
}
|
|
out = append(out, buildPage(rp, sourceSlug, date))
|
|
}
|
|
return out, warnings
|
|
}
|
|
|
|
func computeSlug(rp RawPage, sourceSlug string) string {
|
|
if rp.Type == "source" {
|
|
return sourceSlug
|
|
}
|
|
return wiki.Slug(rp.Title)
|
|
}
|
|
|
|
func buildPage(rp RawPage, sourceSlug, date string) wiki.Page {
|
|
var slug, dir string
|
|
switch rp.Type {
|
|
case "source":
|
|
slug = sourceSlug
|
|
dir = "wiki/sources"
|
|
case "concept":
|
|
slug = wiki.Slug(rp.Title)
|
|
dir = "wiki/concepts"
|
|
case "entity":
|
|
slug = wiki.Slug(rp.Title)
|
|
dir = "wiki/entities"
|
|
default:
|
|
slug = wiki.Slug(rp.Title)
|
|
dir = "wiki/" + rp.Type
|
|
}
|
|
|
|
path := dir + "/" + slug + ".md"
|
|
fm := buildFrontmatter(rp, date)
|
|
|
|
return wiki.Page{
|
|
Path: path,
|
|
Content: fm + "\n" + rp.Content,
|
|
}
|
|
}
|
|
|
|
func buildFrontmatter(rp RawPage, date string) string {
|
|
var sb strings.Builder
|
|
sb.WriteString("---\n")
|
|
fmt.Fprintf(&sb, "title: %s\n", yamlScalar(rp.Title))
|
|
|
|
switch rp.Type {
|
|
case "source":
|
|
subtype := rp.Subtype
|
|
if subtype == "" {
|
|
subtype = "article"
|
|
}
|
|
fmt.Fprintf(&sb, "type: %s\n", yamlScalar(subtype))
|
|
if rp.Domain != "" {
|
|
fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain))
|
|
}
|
|
fmt.Fprintf(&sb, "date_ingested: %s\n", date)
|
|
fmt.Fprintf(&sb, "last_updated: %s\n", date)
|
|
case "concept":
|
|
if rp.Domain != "" {
|
|
fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain))
|
|
}
|
|
fmt.Fprintf(&sb, "last_updated: %s\n", date)
|
|
case "entity":
|
|
if rp.Subtype != "" {
|
|
fmt.Fprintf(&sb, "type: %s\n", yamlScalar(rp.Subtype))
|
|
}
|
|
if rp.Domain != "" {
|
|
fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain))
|
|
}
|
|
fmt.Fprintf(&sb, "last_updated: %s\n", date)
|
|
default:
|
|
if rp.Domain != "" {
|
|
fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain))
|
|
}
|
|
fmt.Fprintf(&sb, "last_updated: %s\n", date)
|
|
}
|
|
|
|
fmt.Fprintf(&sb, "aliases:\n - %s\n", yamlScalar(rp.Title))
|
|
sb.WriteString("---\n")
|
|
return sb.String()
|
|
}
|
|
|
|
// yamlScalar renders s as a quoted YAML scalar that always occupies exactly
// one line.
//
// Ordinary text is emitted single-quoted, with embedded single quotes doubled
// (the only escape that style offers). Single-quoted scalars cannot represent
// control characters, and a raw line break inside one would let the value
// spill onto the following lines of the frontmatter. Any string containing a
// control character (other than tab) or a Unicode line/paragraph separator is
// therefore emitted double-quoted with escape sequences instead.
func yamlScalar(s string) string {
	needsEscape := func(r rune) bool {
		return (r < 0x20 && r != '\t') || r == 0x7f || r == 0x85 || r == 0x2028 || r == 0x2029
	}
	if strings.IndexFunc(s, needsEscape) < 0 {
		return "'" + strings.ReplaceAll(s, "'", "''") + "'"
	}

	var sb strings.Builder
	sb.Grow(len(s) + 2)
	sb.WriteByte('"')
	for _, r := range s {
		switch {
		case r == '"':
			sb.WriteString(`\"`)
		case r == '\\':
			sb.WriteString(`\\`)
		case r == '\n':
			sb.WriteString(`\n`)
		case r == '\r':
			sb.WriteString(`\r`)
		case r == '\t':
			sb.WriteString(`\t`)
		case r < 0x20 || r == 0x7f || r == 0x85:
			// Remaining C0/C1 controls: 8-bit hex escape.
			fmt.Fprintf(&sb, `\x%02X`, r)
		case r == 0x2028 || r == 0x2029:
			// Unicode line and paragraph separators: 16-bit hex escape.
			fmt.Fprintf(&sb, `\u%04X`, r)
		default:
			sb.WriteRune(r)
		}
	}
	sb.WriteByte('"')
	return sb.String()
}
|