From 820d1c93a7c5723899ad393c3c08ded578b7d78e Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 15:44:13 +0200 Subject: [PATCH 1/8] docs: add implementation plan for PDF extraction and entity resolution --- .../2026-04-22-brain-ingestion-quality.md | 858 ++++++++++++++++++ 1 file changed, 858 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-22-brain-ingestion-quality.md diff --git a/docs/superpowers/plans/2026-04-22-brain-ingestion-quality.md b/docs/superpowers/plans/2026-04-22-brain-ingestion-quality.md new file mode 100644 index 0000000..8d0b27b --- /dev/null +++ b/docs/superpowers/plans/2026-04-22-brain-ingestion-quality.md @@ -0,0 +1,858 @@ +# Brain Ingestion Quality: PDF Extraction + Entity Resolution + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Fix PDF ingestion (currently passes raw bytes to LLM) and add fuzzy entity resolution (prevents slug proliferation at scale). + +**Architecture:** Two independent improvements wired into the existing pipeline. A new `extract` package handles text extraction by file type (pdftotext subprocess, passthrough for .md/.txt). A new `resolve.go` in the `pipeline` package normalizes proposed entity/concept titles against the loaded inventory to reuse existing slugs instead of creating duplicates. Both changes are wired into `watcher.go` and `api/handler.go` with no new dependencies except `poppler-utils` in the Docker image. 
+ +**Tech Stack:** Go stdlib (`os/exec`, `bufio`, `strings`), testify, poppler-utils (`pdftotext`) + +--- + +## File Structure + +**New files:** +- `ingestion/internal/extract/extract.go` — `Text(path string) (string, error)` dispatcher +- `ingestion/internal/extract/pdf.go` — `pdftotext` subprocess extraction +- `ingestion/internal/extract/extract_test.go` — table-driven tests for all paths +- `ingestion/internal/pipeline/resolve.go` — `Resolve(proposed []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) []wiki.Page` +- `ingestion/internal/pipeline/resolve_test.go` — table-driven tests + +**Modified files:** +- `ingestion/internal/wiki/types.go` — add `Aliases []string` to `Entry` +- `ingestion/internal/wiki/inventory.go` — `readFrontmatter` reads both title and aliases +- `ingestion/internal/wiki/inventory_test.go` — add alias coverage +- `ingestion/internal/pipeline/pipeline.go` — call `Resolve` after `ParsePages` +- `ingestion/internal/watcher/watcher.go` — call `extract.Text` instead of `os.ReadFile` +- `ingestion/internal/api/handler.go` — call `extract.Text` for path-based ingestion +- `ingestion/Dockerfile` — `apk add poppler-utils` + +--- + +### Task 1: `extract` package — Text() dispatcher with .md/.txt passthrough + +**Files:** +- Create: `ingestion/internal/extract/extract.go` +- Create: `ingestion/internal/extract/extract_test.go` + +- [ ] **Step 1: Write the failing test** + +```go +// ingestion/internal/extract/extract_test.go +package extract + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestText_Markdown(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "note.md") + require.NoError(t, os.WriteFile(path, []byte("# Hello\n\nWorld."), 0o644)) + + got, err := Text(path) + require.NoError(t, err) + assert.Equal(t, "# Hello\n\nWorld.", got) +} + +func TestText_Txt(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "note.txt") + 
require.NoError(t, os.WriteFile(path, []byte("plain text"), 0o644)) + + got, err := Text(path) + require.NoError(t, err) + assert.Equal(t, "plain text", got) +} + +func TestText_UnsupportedExtension(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "data.csv") + require.NoError(t, os.WriteFile(path, []byte("a,b,c"), 0o644)) + + _, err := Text(path) + assert.ErrorContains(t, err, "unsupported") +} +``` + +- [ ] **Step 2: Run to verify it fails** + +```bash +cd ingestion && go test ./internal/extract/... -v +``` +Expected: compile error — package does not exist yet. + +- [ ] **Step 3: Implement extract.go** + +```go +// ingestion/internal/extract/extract.go +package extract + +import ( + "fmt" + "os" + "strings" +) + +// Text reads the file at path and returns its plain-text content. +// Supported extensions: .md, .txt (passthrough), .pdf (via pdftotext). +func Text(path string) (string, error) { + ext := strings.ToLower(fileExt(path)) + switch ext { + case ".md", ".txt": + b, err := os.ReadFile(path) + if err != nil { + return "", fmt.Errorf("read %s: %w", path, err) + } + return string(b), nil + case ".pdf": + return extractPDF(path) + default: + return "", fmt.Errorf("unsupported file extension: %s", ext) + } +} + +// fileExt returns the file extension including the dot, lowercased. +func fileExt(path string) string { + for i := len(path) - 1; i >= 0; i-- { + if path[i] == '.' { + return path[i:] + } + if path[i] == '/' || path[i] == '\\' { + break + } + } + return "" +} +``` + +- [ ] **Step 4: Add pdf.go stub so it compiles** + +```go +// ingestion/internal/extract/pdf.go +package extract + +import "fmt" + +func extractPDF(_ string) (string, error) { + return "", fmt.Errorf("PDF extraction not implemented") +} +``` + +- [ ] **Step 5: Run tests to verify they pass** + +```bash +cd ingestion && go test ./internal/extract/... -v +``` +Expected: PASS — 3 tests passing. 
+ +- [ ] **Step 6: Commit** + +```bash +cd ingestion && git add internal/extract/ +git commit -m "feat(extract): add Text() dispatcher with md/txt passthrough" +``` + +--- + +### Task 2: PDF extraction via pdftotext + +**Files:** +- Modify: `ingestion/internal/extract/pdf.go` +- Modify: `ingestion/internal/extract/extract_test.go` + +- [ ] **Step 1: Add PDF test (skip if pdftotext absent)** + +Append to `extract_test.go`: + +```go +func TestText_PDF(t *testing.T) { + if _, err := exec.LookPath("pdftotext"); err != nil { + t.Skip("pdftotext not available") + } + // Use a known PDF fixture; if none, create a minimal one via echo. + // The test verifies the round-trip: a PDF containing "Hello PDF" yields that string. + dir := t.TempDir() + pdfPath := filepath.Join(dir, "test.pdf") + + // Generate a minimal single-page PDF using a here-doc approach. + // This is a valid minimal PDF containing the text "Hello PDF". + minimalPDF := "%PDF-1.4\n1 0 obj<>endobj\n" + + "2 0 obj<>endobj\n" + + "3 0 obj<>>>>>>>endobj\n" + + "4 0 obj<>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" + + "xref\n0 5\n0000000000 65535 f\n0000000009 00000 n\n0000000058 00000 n\n0000000115 00000 n\n0000000310 00000 n\n" + + "trailer<>\nstartxref\n406\n%%EOF\n" + require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644)) + + got, err := Text(pdfPath) + require.NoError(t, err) + assert.Contains(t, got, "Hello PDF") +} +``` + +Add `"os/exec"` to imports in `extract_test.go`. + +- [ ] **Step 2: Run to verify it fails (or skips)** + +```bash +cd ingestion && go test ./internal/extract/... -v -run TestText_PDF +``` +Expected: SKIP (pdftotext not installed locally) or FAIL with "not implemented". + +- [ ] **Step 3: Implement pdf.go** + +```go +// ingestion/internal/extract/pdf.go +package extract + +import ( + "bytes" + "fmt" + "os/exec" + "strings" +) + +// extractPDF runs pdftotext on path and returns the extracted text. 
+// pdftotext must be installed (package: poppler-utils on Alpine/Debian, poppler on Homebrew). +func extractPDF(path string) (string, error) { + cmd := exec.Command("pdftotext", "-q", path, "-") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + errMsg := strings.TrimSpace(stderr.String()) + if errMsg == "" { + errMsg = err.Error() + } + return "", fmt.Errorf("pdftotext: %s", errMsg) + } + + return strings.TrimSpace(stdout.String()), nil +} +``` + +- [ ] **Step 4: Run all extract tests** + +```bash +cd ingestion && go test ./internal/extract/... -v +``` +Expected: PASS (PDF test skips if pdftotext absent, passes if present). + +- [ ] **Step 5: Commit** + +```bash +cd ingestion && git add internal/extract/pdf.go internal/extract/extract_test.go +git commit -m "feat(extract): implement PDF extraction via pdftotext" +``` + +--- + +### Task 3: `Entry.Aliases` + inventory reads aliases from frontmatter + +**Files:** +- Modify: `ingestion/internal/wiki/types.go` +- Modify: `ingestion/internal/wiki/inventory.go` +- Modify: `ingestion/internal/wiki/inventory_test.go` + +- [ ] **Step 1: Write failing test for alias loading** + +Add to `inventory_test.go`: + +```go +func TestLoadInventory_ReadsAliases(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "entities"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "concepts"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "sources"), 0o755)) + + require.NoError(t, os.WriteFile( + filepath.Join(dir, "wiki", "entities", "ryan-singer.md"), + []byte("---\ntitle: Ryan Singer\naliases:\n - Singer\n - R. Singer\n---\n\n## Description\n\nDesigner.\n"), + 0o644, + )) + + inv, err := LoadInventory(dir) + require.NoError(t, err) + + require.Len(t, inv[PageTypeEntity], 1) + e := inv[PageTypeEntity][0] + assert.Equal(t, "Ryan Singer", e.Title) + assert.Equal(t, []string{"Singer", "R. 
Singer"}, e.Aliases) +} +``` + +- [ ] **Step 2: Run to verify it fails** + +```bash +cd ingestion && go test ./internal/wiki/... -v -run TestLoadInventory_ReadsAliases +``` +Expected: compile error — `Entry` has no `Aliases` field. + +- [ ] **Step 3: Add Aliases to Entry in types.go** + +```go +// Entry is a summary of an existing wiki page used to build the inventory. +type Entry struct { + Slug string + Title string + Aliases []string + Type PageType +} +``` + +- [ ] **Step 4: Replace readTitle with readFrontmatter in inventory.go** + +Replace the `readTitle` function and its call site: + +```go +// readFrontmatter extracts title and aliases from YAML frontmatter. +// Falls back to slug for title and empty aliases on any error. +func readFrontmatter(path, fallbackSlug string) (title string, aliases []string) { + title = fallbackSlug + f, err := os.Open(path) + if err != nil { + return + } + defer f.Close() + + scanner := bufio.NewScanner(f) + inFM := false + inAliases := false + for scanner.Scan() { + line := scanner.Text() + if strings.TrimSpace(line) == "---" { + if !inFM { + inFM = true + continue + } + break // end of frontmatter + } + if !inFM { + continue + } + + // Detect alias list items (lines starting with " - "). + if inAliases { + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "- ") { + aliases = append(aliases, strings.TrimPrefix(trimmed, "- ")) + continue + } + inAliases = false // end of alias block + } + + key, val, ok := strings.Cut(line, ":") + if !ok { + continue + } + switch strings.TrimSpace(key) { + case "title": + title = strings.Trim(strings.TrimSpace(val), `"'`) + case "aliases": + inAliases = true + } + } + return +} +``` + +Update `LoadInventory` to use `readFrontmatter`: + +```go +title, aliases := readFrontmatter(path, slug) +result[pt] = append(result[pt], Entry{Slug: slug, Title: title, Aliases: aliases, Type: pt}) +``` + +Remove the old `readTitle` function entirely. 
+ +- [ ] **Step 5: Run all wiki tests** + +```bash +cd ingestion && go test ./internal/wiki/... -v +``` +Expected: PASS — all existing tests plus new alias test. + +- [ ] **Step 6: Commit** + +```bash +cd ingestion && git add internal/wiki/types.go internal/wiki/inventory.go internal/wiki/inventory_test.go +git commit -m "feat(wiki): add Aliases to Entry and read from YAML frontmatter" +``` + +--- + +### Task 4: Fuzzy entity resolution + +**Files:** +- Create: `ingestion/internal/pipeline/resolve.go` +- Create: `ingestion/internal/pipeline/resolve_test.go` + +- [ ] **Step 1: Write failing tests** + +```go +// ingestion/internal/pipeline/resolve_test.go +package pipeline + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +func TestResolve_NoMatch(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/new-person.md", Content: "---\ntitle: New Person\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer"}}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/new-person.md", got[0].Path) +} + +func TestResolve_TitleMatchRedirectsSlug(t *testing.T) { + // Proposed slug differs from existing but title matches. + proposed := []wiki.Page{ + {Path: "wiki/entities/ryan-singer-the-designer.md", Content: "---\ntitle: Ryan Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/ryan-singer.md", got[0].Path) +} + +func TestResolve_AliasMatchRedirectsSlug(t *testing.T) { + // Proposed title matches an existing alias. 
+ proposed := []wiki.Page{ + {Path: "wiki/entities/singer.md", Content: "---\ntitle: Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer", "R. Singer"}}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/ryan-singer.md", got[0].Path) +} + +func TestResolve_NormalizationCaseAndArticles(t *testing.T) { + // "the shape up method" normalizes to "shape up method" which matches "Shape Up Method". + proposed := []wiki.Page{ + {Path: "wiki/concepts/the-shape-up-method.md", Content: "---\ntitle: The Shape Up Method\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeConcept: { + {Slug: "shape-up-method", Title: "Shape Up Method", Aliases: nil}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/concepts/shape-up-method.md", got[0].Path) +} + +func TestResolve_OnlyMatchesSamePageType(t *testing.T) { + // A concept slug must not redirect to an entity with the same normalized name. + proposed := []wiki.Page{ + {Path: "wiki/concepts/ryan-singer.md", Content: "---\ntitle: Ryan Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil}, + }, + wiki.PageTypeConcept: {}, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + // Not redirected — different page type. + assert.Equal(t, "wiki/concepts/ryan-singer.md", got[0].Path) +} + +func TestResolve_EmptyInventory(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/first.md", Content: "---\ntitle: First\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{} + got := Resolve(proposed, inventory) + assert.Equal(t, proposed, got) +} +``` + +- [ ] **Step 2: Run to verify it fails** + +```bash +cd ingestion && go test ./internal/pipeline/... 
-v -run TestResolve +``` +Expected: compile error — `Resolve` not defined. + +- [ ] **Step 3: Implement resolve.go** + +```go +// ingestion/internal/pipeline/resolve.go +package pipeline + +import ( + "path/filepath" + "strings" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +// Resolve remaps proposed pages to existing slugs when a fuzzy title match is found. +// It only matches within the same page type (entities→entities, concepts→concepts). +// Pages with no inventory match are returned unchanged. +func Resolve(proposed []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) []wiki.Page { + // Build normalized lookup: normalized_title → canonical slug, keyed by page type. + type key struct { + pt wiki.PageType + normalized string + } + lookup := make(map[key]string) // key → canonical slug + for pt, entries := range inventory { + for _, e := range entries { + k := key{pt: pt, normalized: normalizeTitle(e.Title)} + lookup[k] = e.Slug + for _, alias := range e.Aliases { + ak := key{pt: pt, normalized: normalizeTitle(alias)} + if _, exists := lookup[ak]; !exists { + lookup[ak] = e.Slug + } + } + } + } + + out := make([]wiki.Page, 0, len(proposed)) + for _, page := range proposed { + pt := pageTypeFromPath(page.Path) + title := extractTitle(page.Content) + k := key{pt: pt, normalized: normalizeTitle(title)} + if canonicalSlug, ok := lookup[k]; ok { + // Redirect path to canonical slug. + dir := filepath.Dir(page.Path) + page.Path = dir + "/" + canonicalSlug + ".md" + } + out = append(out, page) + } + return out +} + +// normalizeTitle lowercases, removes leading articles, collapses whitespace. +// "The Shape Up Method" → "shape up method" +func normalizeTitle(s string) string { + s = strings.ToLower(strings.TrimSpace(s)) + // Strip leading articles. + for _, article := range []string{"the ", "a ", "an "} { + s = strings.TrimPrefix(s, article) + } + // Collapse internal whitespace and replace hyphens. 
+ s = strings.ReplaceAll(s, "-", " ") + return strings.Join(strings.Fields(s), " ") +} + +// pageTypeFromPath extracts the wiki.PageType from a path like "wiki/entities/foo.md". +func pageTypeFromPath(path string) wiki.PageType { + parts := strings.Split(filepath.ToSlash(path), "/") + if len(parts) >= 2 { + return wiki.PageType(parts[1]) + } + return "" +} + +// extractTitle reads the title field from YAML frontmatter in content. +// Falls back to empty string if not found. +func extractTitle(content string) string { + lines := strings.SplitN(content, "\n", 30) + inFM := false + for _, line := range lines { + if strings.TrimSpace(line) == "---" { + if !inFM { + inFM = true + continue + } + break + } + if inFM { + key, val, ok := strings.Cut(line, ":") + if ok && strings.TrimSpace(key) == "title" { + return strings.Trim(strings.TrimSpace(val), `"'`) + } + } + } + return "" +} +``` + +- [ ] **Step 4: Run resolve tests** + +```bash +cd ingestion && go test ./internal/pipeline/... -v -run TestResolve +``` +Expected: PASS — 6 tests passing. + +- [ ] **Step 5: Commit** + +```bash +cd ingestion && git add internal/pipeline/resolve.go internal/pipeline/resolve_test.go +git commit -m "feat(pipeline): add fuzzy entity resolution to prevent slug proliferation" +``` + +--- + +### Task 5: Wire Resolve into pipeline.Run + +**Files:** +- Modify: `ingestion/internal/pipeline/pipeline.go` + +- [ ] **Step 1: Add Resolve call after ParsePages in Run()** + +In `pipeline.go`, locate the loop that builds `allPages`. After `allPages = append(allPages, pages...)`, we have all pages from all chunks. Resolve must run after all chunks are merged, against the snapshot inventory loaded at the start of the run. 
+ +Replace the `merged := mergeAll(allPages)` line with: + +```go +resolved := Resolve(allPages, inventory) +merged := mergeAll(resolved) +``` + +The full relevant section of `Run` after this change: + +```go +for _, chunk := range chunks { + userPrompt := BuildPrompt(schema, source, chunk, inventory) + output, err := cfg.Complete(ctx, systemPrompt, userPrompt) + if err != nil { + return Result{}, fmt.Errorf("LLM call: %w", err) + } + pages, warnings := ParsePages(output) + allPages = append(allPages, pages...) + allWarnings = append(allWarnings, warnings...) +} + +resolved := Resolve(allPages, inventory) +merged := mergeAll(resolved) +``` + +- [ ] **Step 2: Run all pipeline tests** + +```bash +cd ingestion && go test ./internal/pipeline/... -v +``` +Expected: PASS — all existing tests still pass (Resolve is a no-op when inventory is empty or no title matches). + +- [ ] **Step 3: Commit** + +```bash +cd ingestion && git add internal/pipeline/pipeline.go +git commit -m "feat(pipeline): resolve proposed pages against inventory before writing" +``` + +--- + +### Task 6: Wire extract.Text into watcher and handler + +**Files:** +- Modify: `ingestion/internal/watcher/watcher.go` +- Modify: `ingestion/internal/api/handler.go` + +- [ ] **Step 1: Update watcher.go** + +In `processFile`, replace: + +```go +content, err := os.ReadFile(path) +if err != nil { + return fmt.Errorf("read file: %w", err) +} + +_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, string(content), source, false) +``` + +With: + +```go +content, err := extract.Text(path) +if err != nil { + return fmt.Errorf("extract text: %w", err) +} + +_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, content, source, false) +``` + +Add import: `"github.com/mathiasbq/hyperguild/ingestion/internal/extract"` + +Keep import: `"os"` — it is still used for `os.MkdirAll`, `os.WriteFile`, and `os.Stat`, so do not remove it. 
+ +- [ ] **Step 2: Update handler.go — single-file path** + +In `IngestPath`, the single-file branch reads: + +```go +content, readErr := os.ReadFile(req.Path) +if readErr != nil { + writeError(w, http.StatusInternalServerError, fmt.Sprintf("read file: %v", readErr)) + return +} +``` + +Replace with: + +```go +content, readErr := extract.Text(req.Path) +if readErr != nil { + writeError(w, http.StatusInternalServerError, fmt.Sprintf("extract text: %v", readErr)) + return +} +``` + +- [ ] **Step 3: Update handler.go — directory walk branch** + +In `IngestPath`, the directory walk reads: + +```go +content, readErr := os.ReadFile(path) +if readErr != nil { + allWarnings = append(allWarnings, fmt.Sprintf("read %s: %v", path, readErr)) + return nil +} +source := req.Source +if source == "" { + source = filepath.Base(path) +} +result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun) +``` + +Replace with: + +```go +content, readErr := extract.Text(path) +if readErr != nil { + allWarnings = append(allWarnings, fmt.Sprintf("extract %s: %v", path, readErr)) + return nil +} +source := req.Source +if source == "" { + source = filepath.Base(path) +} +result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun) +``` + +Add import: `"github.com/mathiasbq/hyperguild/ingestion/internal/extract"` to handler.go. + +- [ ] **Step 4: Build to verify no compile errors** + +```bash +cd ingestion && go build ./... +``` +Expected: success, no errors. + +- [ ] **Step 5: Run all tests** + +```bash +cd ingestion && go test ./... +``` +Expected: PASS — all tests pass (watcher tests use .md files, already covered by extract passthrough). 
+ +- [ ] **Step 6: Commit** + +```bash +cd ingestion && git add internal/watcher/watcher.go internal/api/handler.go +git commit -m "feat(watcher,api): use extract.Text() for file reading — fixes PDF ingestion" +``` + +--- + +### Task 7: Add poppler-utils to Dockerfile + +**Files:** +- Modify: `ingestion/Dockerfile` + +- [ ] **Step 1: Add apk install for poppler-utils** + +In `ingestion/Dockerfile`, add `poppler-utils` to the Alpine runtime stage. The current final stage is: + +```dockerfile +FROM alpine:3.21 + +COPY --from=builder /out/ingestion /usr/local/bin/ingestion + +RUN addgroup -S ingestion && adduser -S -G ingestion ingestion +``` + +Replace with: + +```dockerfile +FROM alpine:3.21 + +RUN apk add --no-cache poppler-utils + +COPY --from=builder /out/ingestion /usr/local/bin/ingestion + +RUN addgroup -S ingestion && adduser -S -G ingestion ingestion +``` + +- [ ] **Step 2: Verify Dockerfile builds (local Docker)** + +```bash +cd ingestion && docker build -t ingestion:test . +``` +Expected: image builds successfully; `pdftotext` is available inside. + +- [ ] **Step 3: Verify pdftotext is accessible in the image** + +```bash +docker run --rm ingestion:test pdftotext -v +``` +Expected: prints version string like `pdftotext version 24.x.x`. 
+ +- [ ] **Step 4: Commit** + +```bash +cd ingestion && git add Dockerfile +git commit -m "chore(docker): add poppler-utils for PDF text extraction" +``` + +--- + +## Self-Review + +**Spec coverage check:** + +| Requirement | Task | +|---|---| +| PDF extraction via pdftotext | Tasks 2, 6, 7 | +| .md and .txt passthrough (no regression) | Task 1 | +| Unsupported extension → clear error | Task 1 | +| Entry.Aliases loaded from frontmatter | Task 3 | +| Fuzzy normalization (case, articles, hyphens) | Task 4 | +| Alias matching | Task 4 | +| Title matching across different proposed slugs | Task 4 | +| Cross-page-type isolation (concept ≠ entity) | Task 4 | +| Resolve wired into pipeline.Run | Task 5 | +| extract.Text wired into watcher | Task 6 | +| extract.Text wired into handler (single + dir) | Task 6 | +| Dockerfile includes poppler-utils | Task 7 | + +**Placeholder scan:** None found. + +**Type consistency:** +- `Resolve([]wiki.Page, map[wiki.PageType][]wiki.Entry) []wiki.Page` — consistent across Tasks 4 and 5. +- `extract.Text(path string) (string, error)` — consistent across Tasks 1, 2, and 6. +- `Entry.Aliases []string` — added in Task 3, used by Resolve in Task 4 (reads `e.Aliases`). +- `readFrontmatter` replaces `readTitle` entirely in Task 3 — no lingering `readTitle` calls. 
From 43a46d07e544b24590dd262739911b4f57627c83 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 15:45:20 +0200 Subject: [PATCH 2/8] feat(extract): add Text() dispatcher with md/txt passthrough --- ingestion/internal/extract/extract.go | 39 +++++++++++++++++++++ ingestion/internal/extract/extract_test.go | 40 ++++++++++++++++++++++ ingestion/internal/extract/pdf.go | 8 +++++ 3 files changed, 87 insertions(+) create mode 100644 ingestion/internal/extract/extract.go create mode 100644 ingestion/internal/extract/extract_test.go create mode 100644 ingestion/internal/extract/pdf.go diff --git a/ingestion/internal/extract/extract.go b/ingestion/internal/extract/extract.go new file mode 100644 index 0000000..725c85f --- /dev/null +++ b/ingestion/internal/extract/extract.go @@ -0,0 +1,39 @@ +// ingestion/internal/extract/extract.go +package extract + +import ( + "fmt" + "os" + "strings" +) + +// Text reads the file at path and returns its plain-text content. +// Supported extensions: .md, .txt (passthrough), .pdf (via pdftotext). +func Text(path string) (string, error) { + ext := strings.ToLower(fileExt(path)) + switch ext { + case ".md", ".txt": + b, err := os.ReadFile(path) + if err != nil { + return "", fmt.Errorf("read %s: %w", path, err) + } + return string(b), nil + case ".pdf": + return extractPDF(path) + default: + return "", fmt.Errorf("unsupported file extension: %s", ext) + } +} + +// fileExt returns the file extension including the dot, lowercased. +func fileExt(path string) string { + for i := len(path) - 1; i >= 0; i-- { + if path[i] == '.' 
{ + return path[i:] + } + if path[i] == '/' || path[i] == '\\' { + break + } + } + return "" +} diff --git a/ingestion/internal/extract/extract_test.go b/ingestion/internal/extract/extract_test.go new file mode 100644 index 0000000..44cd5fb --- /dev/null +++ b/ingestion/internal/extract/extract_test.go @@ -0,0 +1,40 @@ +// ingestion/internal/extract/extract_test.go +package extract + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestText_Markdown(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "note.md") + require.NoError(t, os.WriteFile(path, []byte("# Hello\n\nWorld."), 0o644)) + + got, err := Text(path) + require.NoError(t, err) + assert.Equal(t, "# Hello\n\nWorld.", got) +} + +func TestText_Txt(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "note.txt") + require.NoError(t, os.WriteFile(path, []byte("plain text"), 0o644)) + + got, err := Text(path) + require.NoError(t, err) + assert.Equal(t, "plain text", got) +} + +func TestText_UnsupportedExtension(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "data.csv") + require.NoError(t, os.WriteFile(path, []byte("a,b,c"), 0o644)) + + _, err := Text(path) + assert.ErrorContains(t, err, "unsupported") +} diff --git a/ingestion/internal/extract/pdf.go b/ingestion/internal/extract/pdf.go new file mode 100644 index 0000000..4a073c4 --- /dev/null +++ b/ingestion/internal/extract/pdf.go @@ -0,0 +1,8 @@ +// ingestion/internal/extract/pdf.go +package extract + +import "fmt" + +func extractPDF(_ string) (string, error) { + return "", fmt.Errorf("PDF extraction not implemented") +} From 9cc6c2d053bb424a55c14e3940f6d966cd22c48a Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 15:53:46 +0200 Subject: [PATCH 3/8] feat(extract): implement PDF extraction via pdftotext --- ingestion/internal/extract/extract_test.go | 22 ++++++++++++++++++ 
ingestion/internal/extract/pdf.go | 26 +++++++++++++++++++--- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/ingestion/internal/extract/extract_test.go b/ingestion/internal/extract/extract_test.go index 44cd5fb..2ef75aa 100644 --- a/ingestion/internal/extract/extract_test.go +++ b/ingestion/internal/extract/extract_test.go @@ -3,6 +3,7 @@ package extract import ( "os" + "os/exec" "path/filepath" "testing" @@ -38,3 +39,24 @@ func TestText_UnsupportedExtension(t *testing.T) { _, err := Text(path) assert.ErrorContains(t, err, "unsupported") } + +func TestText_PDF(t *testing.T) { + if _, err := exec.LookPath("pdftotext"); err != nil { + t.Skip("pdftotext not available") + } + dir := t.TempDir() + pdfPath := filepath.Join(dir, "test.pdf") + + // Minimal valid PDF containing the text "Hello PDF". + minimalPDF := "%PDF-1.4\n1 0 obj<>endobj\n" + + "2 0 obj<>endobj\n" + + "3 0 obj<>>>>>>>endobj\n" + + "4 0 obj<>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" + + "xref\n0 5\n0000000000 65535 f\n0000000009 00000 n\n0000000058 00000 n\n0000000115 00000 n\n0000000310 00000 n\n" + + "trailer<>\nstartxref\n406\n%%EOF\n" + require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644)) + + got, err := Text(pdfPath) + require.NoError(t, err) + assert.Contains(t, got, "Hello PDF") +} diff --git a/ingestion/internal/extract/pdf.go b/ingestion/internal/extract/pdf.go index 4a073c4..8415a14 100644 --- a/ingestion/internal/extract/pdf.go +++ b/ingestion/internal/extract/pdf.go @@ -1,8 +1,28 @@ // ingestion/internal/extract/pdf.go package extract -import "fmt" +import ( + "bytes" + "fmt" + "os/exec" + "strings" +) -func extractPDF(_ string) (string, error) { - return "", fmt.Errorf("PDF extraction not implemented") +// extractPDF runs pdftotext on path and returns the extracted text. +// pdftotext must be installed (package: poppler-utils on Alpine/Debian, poppler on Homebrew). 
+func extractPDF(path string) (string, error) { + cmd := exec.Command("pdftotext", "-q", path, "-") + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + errMsg := strings.TrimSpace(stderr.String()) + if errMsg == "" { + errMsg = err.Error() + } + return "", fmt.Errorf("pdftotext: %s", errMsg) + } + + return strings.TrimSpace(stdout.String()), nil } From bf6f497d9d09af7c404a3b714afce7b6fb55fb06 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 15:57:16 +0200 Subject: [PATCH 4/8] feat(wiki): add Aliases to Entry and read from YAML frontmatter --- ingestion/internal/wiki/inventory.go | 43 +++++++++++++++++------ ingestion/internal/wiki/inventory_test.go | 21 +++++++++++ ingestion/internal/wiki/slug.go | 2 +- ingestion/internal/wiki/types.go | 7 ++-- 4 files changed, 58 insertions(+), 15 deletions(-) diff --git a/ingestion/internal/wiki/inventory.go b/ingestion/internal/wiki/inventory.go index 43a4ad1..d887d48 100644 --- a/ingestion/internal/wiki/inventory.go +++ b/ingestion/internal/wiki/inventory.go @@ -32,23 +32,26 @@ func LoadInventory(brainDir string) (map[PageType][]Entry, error) { } slug := strings.TrimSuffix(e.Name(), ".md") path := filepath.Join(dir, e.Name()) - title := readTitle(path, slug) - result[pt] = append(result[pt], Entry{Slug: slug, Title: title, Type: pt}) + title, aliases := readFrontmatter(path, slug) + result[pt] = append(result[pt], Entry{Slug: slug, Title: title, Aliases: aliases, Type: pt}) } } return result, nil } -// readTitle extracts the title from YAML frontmatter, falling back to slug. -func readTitle(path, fallback string) string { +// readFrontmatter extracts title and aliases from YAML frontmatter. +// Falls back to slug for title and empty aliases on any error. 
+func readFrontmatter(path, fallbackSlug string) (title string, aliases []string) { + title = fallbackSlug f, err := os.Open(path) if err != nil { - return fallback + return } defer f.Close() scanner := bufio.NewScanner(f) inFM := false + inAliases := false for scanner.Scan() { line := scanner.Text() if strings.TrimSpace(line) == "---" { @@ -56,14 +59,32 @@ func readTitle(path, fallback string) string { inFM = true continue } - break + break // end of frontmatter } - if inFM { - key, val, ok := strings.Cut(line, ":") - if ok && strings.TrimSpace(key) == "title" { - return strings.Trim(strings.TrimSpace(val), `"'`) + if !inFM { + continue + } + + // Detect alias list items (lines starting with " - "). + if inAliases { + trimmed := strings.TrimSpace(line) + if strings.HasPrefix(trimmed, "- ") { + aliases = append(aliases, strings.TrimPrefix(trimmed, "- ")) + continue } + inAliases = false // end of alias block + } + + key, val, ok := strings.Cut(line, ":") + if !ok { + continue + } + switch strings.TrimSpace(key) { + case "title": + title = strings.Trim(strings.TrimSpace(val), `"'`) + case "aliases": + inAliases = true } } - return fallback + return } diff --git a/ingestion/internal/wiki/inventory_test.go b/ingestion/internal/wiki/inventory_test.go index 7a485dc..7068fdb 100644 --- a/ingestion/internal/wiki/inventory_test.go +++ b/ingestion/internal/wiki/inventory_test.go @@ -60,3 +60,24 @@ func TestLoadInventory_MissingDirsOk(t *testing.T) { require.NoError(t, err) assert.NotNil(t, inv) } + +func TestLoadInventory_ReadsAliases(t *testing.T) { + dir := t.TempDir() + require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "entities"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "concepts"), 0o755)) + require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "sources"), 0o755)) + + require.NoError(t, os.WriteFile( + filepath.Join(dir, "wiki", "entities", "ryan-singer.md"), + []byte("---\ntitle: Ryan Singer\naliases:\n - Singer\n - R. 
Singer\n---\n\n## Description\n\nDesigner.\n"), + 0o644, + )) + + inv, err := LoadInventory(dir) + require.NoError(t, err) + + require.Len(t, inv[PageTypeEntity], 1) + e := inv[PageTypeEntity][0] + assert.Equal(t, "Ryan Singer", e.Title) + assert.Equal(t, []string{"Singer", "R. Singer"}, e.Aliases) +} diff --git a/ingestion/internal/wiki/slug.go b/ingestion/internal/wiki/slug.go index ab39b46..feb1681 100644 --- a/ingestion/internal/wiki/slug.go +++ b/ingestion/internal/wiki/slug.go @@ -21,7 +21,7 @@ func Slug(title string) string { case unicode.IsLetter(r) || unicode.IsDigit(r): b.WriteRune(r) prevHyphen = false - // all other characters (apostrophes, colons, dots, etc.) are dropped + // all other characters (apostrophes, colons, dots, etc.) are dropped } } return strings.TrimRight(b.String(), "-") diff --git a/ingestion/internal/wiki/types.go b/ingestion/internal/wiki/types.go index b555d91..62e74aa 100644 --- a/ingestion/internal/wiki/types.go +++ b/ingestion/internal/wiki/types.go @@ -18,7 +18,8 @@ type Page struct { // Entry is a summary of an existing wiki page used to build the inventory. 
type Entry struct { - Slug string - Title string - Type PageType + Slug string + Title string + Aliases []string + Type PageType } From e9b5cc401cbd64c850bf53459434dc7f63220df8 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 15:59:36 +0200 Subject: [PATCH 5/8] feat(pipeline): add fuzzy entity resolution to prevent slug proliferation --- ingestion/internal/pipeline/resolve.go | 88 ++++++++++++++++++++ ingestion/internal/pipeline/resolve_test.go | 90 +++++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 ingestion/internal/pipeline/resolve.go create mode 100644 ingestion/internal/pipeline/resolve_test.go diff --git a/ingestion/internal/pipeline/resolve.go b/ingestion/internal/pipeline/resolve.go new file mode 100644 index 0000000..df08249 --- /dev/null +++ b/ingestion/internal/pipeline/resolve.go @@ -0,0 +1,88 @@ +// ingestion/internal/pipeline/resolve.go +package pipeline + +import ( + "path/filepath" + "strings" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +// Resolve remaps proposed pages to existing slugs when a fuzzy title match is found. +// It only matches within the same page type (entities→entities, concepts→concepts). +// Pages with no inventory match are returned unchanged. 
+func Resolve(proposed []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) []wiki.Page {
+	type key struct {
+		pt         wiki.PageType
+		normalized string
+	}
+	// A title always claims its key (last one wins); an alias only fills a key that is still free.
+	lookup := make(map[key]string) // key → canonical slug
+	for pt, entries := range inventory {
+		for _, e := range entries {
+			k := key{pt: pt, normalized: normalizeTitle(e.Title)}
+			lookup[k] = e.Slug
+			for _, alias := range e.Aliases {
+				ak := key{pt: pt, normalized: normalizeTitle(alias)}
+				if _, exists := lookup[ak]; !exists {
+					lookup[ak] = e.Slug
+				}
+			}
+		}
+	}
+
+	out := make([]wiki.Page, 0, len(proposed))
+	for _, page := range proposed {
+		pt := pageTypeFromPath(page.Path)
+		title := extractTitle(page.Content)
+		k := key{pt: pt, normalized: normalizeTitle(title)}
+		if canonicalSlug, ok := lookup[k]; ok && k.normalized != "" { // empty key = no usable title: never remap
+			page.Path = filepath.ToSlash(filepath.Join(filepath.Dir(page.Path), canonicalSlug+".md")) // keep "/" separators
+		}
+		out = append(out, page)
+	}
+	return out
+}
+
+// normalizeTitle lowercases s, drops at most one leading article ("the", "a", "an"), turns hyphens
+// into spaces and collapses whitespace runs: "The Shape Up Method" → "shape up method".
+func normalizeTitle(s string) string {
+	s = strings.ToLower(strings.TrimSpace(s))
+	if first, rest, ok := strings.Cut(s, " "); ok && (first == "the" || first == "a" || first == "an") {
+		s = rest // one article only: "The A Team" → "a team", not "team"
+	}
+	s = strings.ReplaceAll(s, "-", " ")
+	return strings.Join(strings.Fields(s), " ")
+}
+
+// pageTypeFromPath returns the second segment of a path like "wiki/entities/foo.md" ("entities"), or "" if the path is shorter.
+func pageTypeFromPath(path string) wiki.PageType {
+	parts := strings.Split(filepath.ToSlash(path), "/")
+	if len(parts) >= 3 { // root/type/file: parts[1] is then a directory, never the file name
+		return wiki.PageType(parts[1])
+	}
+	return ""
+}
+
+// extractTitle reads the title field from YAML frontmatter in content.
+// Falls back to empty string if not found.
+func extractTitle(content string) string { + lines := strings.SplitN(content, "\n", 30) + inFM := false + for _, line := range lines { + if strings.TrimSpace(line) == "---" { + if !inFM { + inFM = true + continue + } + break + } + if inFM { + key, val, ok := strings.Cut(line, ":") + if ok && strings.TrimSpace(key) == "title" { + return strings.Trim(strings.TrimSpace(val), `"'`) + } + } + } + return "" +} diff --git a/ingestion/internal/pipeline/resolve_test.go b/ingestion/internal/pipeline/resolve_test.go new file mode 100644 index 0000000..19b66e5 --- /dev/null +++ b/ingestion/internal/pipeline/resolve_test.go @@ -0,0 +1,90 @@ +// ingestion/internal/pipeline/resolve_test.go +package pipeline + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +func TestResolve_NoMatch(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/new-person.md", Content: "---\ntitle: New Person\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer"}}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/new-person.md", got[0].Path) +} + +func TestResolve_TitleMatchRedirectsSlug(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/ryan-singer-the-designer.md", Content: "---\ntitle: Ryan Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/ryan-singer.md", got[0].Path) +} + +func TestResolve_AliasMatchRedirectsSlug(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/singer.md", Content: "---\ntitle: Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", 
Title: "Ryan Singer", Aliases: []string{"Singer", "R. Singer"}}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/ryan-singer.md", got[0].Path) +} + +func TestResolve_NormalizationCaseAndArticles(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/concepts/the-shape-up-method.md", Content: "---\ntitle: The Shape Up Method\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeConcept: { + {Slug: "shape-up-method", Title: "Shape Up Method", Aliases: nil}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/concepts/shape-up-method.md", got[0].Path) +} + +func TestResolve_OnlyMatchesSamePageType(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/concepts/ryan-singer.md", Content: "---\ntitle: Ryan Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil}, + }, + wiki.PageTypeConcept: {}, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/concepts/ryan-singer.md", got[0].Path) +} + +func TestResolve_EmptyInventory(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/first.md", Content: "---\ntitle: First\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{} + got := Resolve(proposed, inventory) + assert.Equal(t, proposed, got) +} From 53e46781b1fb6966eb0379b26dc12f7c75ebd3f4 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 16:00:31 +0200 Subject: [PATCH 6/8] feat(pipeline): resolve proposed pages against inventory before writing --- ingestion/internal/pipeline/pipeline.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ingestion/internal/pipeline/pipeline.go b/ingestion/internal/pipeline/pipeline.go index e4c4b97..1e6c924 100644 --- a/ingestion/internal/pipeline/pipeline.go +++ b/ingestion/internal/pipeline/pipeline.go @@ -57,7 +57,8 @@ func 
Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR allWarnings = append(allWarnings, warnings...) } - merged := mergeAll(allPages) + resolved := Resolve(allPages, inventory) + merged := mergeAll(resolved) date := time.Now().UTC().Format("2006-01-02") var written []string From 2975eadc87c7b6661659838fa150d0411b11a5dd Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 16:01:36 +0200 Subject: [PATCH 7/8] =?UTF-8?q?feat(watcher,api):=20use=20extract.Text()?= =?UTF-8?q?=20for=20file=20reading=20=E2=80=94=20fixes=20PDF=20ingestion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ingestion/internal/api/handler.go | 13 +++++++------ ingestion/internal/watcher/watcher.go | 7 ++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/ingestion/internal/api/handler.go b/ingestion/internal/api/handler.go index 0637cde..141ca0a 100644 --- a/ingestion/internal/api/handler.go +++ b/ingestion/internal/api/handler.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "github.com/mathiasbq/hyperguild/ingestion/internal/extract" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" "github.com/mathiasbq/hyperguild/ingestion/internal/search" ) @@ -214,16 +215,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) { if !supportedExtensions[ext] { return nil } - content, readErr := os.ReadFile(path) + content, readErr := extract.Text(path) if readErr != nil { - allWarnings = append(allWarnings, fmt.Sprintf("read %s: %v", path, readErr)) + allWarnings = append(allWarnings, fmt.Sprintf("extract %s: %v", path, readErr)) return nil } source := req.Source if source == "" { source = filepath.Base(path) } - result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun) + result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun) if runErr != nil { allWarnings = append(allWarnings, 
fmt.Sprintf("ingest %s: %v", path, runErr)) return nil @@ -243,16 +244,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusBadRequest, fmt.Sprintf("unsupported file extension: %s", ext)) return } - content, readErr := os.ReadFile(req.Path) + content, readErr := extract.Text(req.Path) if readErr != nil { - writeError(w, http.StatusInternalServerError, fmt.Sprintf("read file: %v", readErr)) + writeError(w, http.StatusInternalServerError, fmt.Sprintf("extract text: %v", readErr)) return } source := req.Source if source == "" { source = filepath.Base(req.Path) } - result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun) + result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun) if runErr != nil { h.logger.Error("ingest-path failed", "path", req.Path, "err", runErr) writeError(w, http.StatusInternalServerError, "ingest error") diff --git a/ingestion/internal/watcher/watcher.go b/ingestion/internal/watcher/watcher.go index a65db79..2a1c0fb 100644 --- a/ingestion/internal/watcher/watcher.go +++ b/ingestion/internal/watcher/watcher.go @@ -11,6 +11,7 @@ import ( "time" "unicode" + "github.com/mathiasbq/hyperguild/ingestion/internal/extract" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" ) @@ -88,12 +89,12 @@ func processFile(ctx context.Context, cfg Config, path, date string) error { filename := filepath.Base(path) source := deriveSource(filename) - content, err := os.ReadFile(path) + content, err := extract.Text(path) if err != nil { - return fmt.Errorf("read file: %w", err) + return fmt.Errorf("extract text: %w", err) } - _, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, string(content), source, false) + _, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, content, source, false) if runErr != nil { // Move to failed/. 
failedDir := filepath.Join(cfg.BrainDir, "raw", "failed") From a37d18bf7aac8c25dfafcae23c6559811e50bee1 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 16:02:12 +0200 Subject: [PATCH 8/8] chore(docker): add poppler-utils for PDF text extraction --- ingestion/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ingestion/Dockerfile b/ingestion/Dockerfile index 4e9ebc5..bf91361 100644 --- a/ingestion/Dockerfile +++ b/ingestion/Dockerfile @@ -15,6 +15,8 @@ RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 \ FROM alpine:3.21 +RUN apk add --no-cache poppler-utils + COPY --from=builder /out/ingestion /usr/local/bin/ingestion RUN addgroup -S ingestion && adduser -S -G ingestion ingestion