feat: PDF extraction and fuzzy entity resolution

- New extract package: Text() dispatcher for .md/.txt passthrough and PDF extraction via a pdftotext subprocess
- wiki.Entry gains Aliases []string, loaded from YAML frontmatter
- Fuzzy entity resolution in pipeline: normalizes titles (lowercase, strip leading articles, treat hyphens as spaces) and matches proposed pages against existing inventory titles and aliases, remapping matches to the existing slug to prevent page proliferation
- Watcher and API handler now use extract.Text() instead of os.ReadFile
- Dockerfile: apk add poppler-utils in Alpine runtime stage
2026-04-23 16:03:02 +02:00
parent 6928907d79 a37d18bf7a
commit a6c39e8691
14 changed files with 1238 additions and 25 deletions
--- a/ingestion/internal/api/handler.go
+++ b/ingestion/internal/api/handler.go
@@ -11,6 +11,7 @@ import (
 	"strings"
 	"time"

+	"github.com/mathiasbq/hyperguild/ingestion/internal/extract"
 	"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
 	"github.com/mathiasbq/hyperguild/ingestion/internal/search"
 )
@@ -214,16 +215,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) {
 			if !supportedExtensions[ext] {
 				return nil
 			}
-			content, readErr := os.ReadFile(path)
+			content, readErr := extract.Text(path)
 			if readErr != nil {
-				allWarnings = append(allWarnings, fmt.Sprintf("read %s: %v", path, readErr))
+				allWarnings = append(allWarnings, fmt.Sprintf("extract %s: %v", path, readErr))
 				return nil
 			}
 			source := req.Source
 			if source == "" {
 				source = filepath.Base(path)
 			}
-			result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun)
+			result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun)
 			if runErr != nil {
 				allWarnings = append(allWarnings, fmt.Sprintf("ingest %s: %v", path, runErr))
 				return nil
@@ -243,16 +244,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) {
 			writeError(w, http.StatusBadRequest, fmt.Sprintf("unsupported file extension: %s", ext))
 			return
 		}
-		content, readErr := os.ReadFile(req.Path)
+		content, readErr := extract.Text(req.Path)
 		if readErr != nil {
-			writeError(w, http.StatusInternalServerError, fmt.Sprintf("read file: %v", readErr))
+			writeError(w, http.StatusInternalServerError, fmt.Sprintf("extract text: %v", readErr))
 			return
 		}
 		source := req.Source
 		if source == "" {
 			source = filepath.Base(req.Path)
 		}
-		result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun)
+		result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun)
 		if runErr != nil {
 			h.logger.Error("ingest-path failed", "path", req.Path, "err", runErr)
 			writeError(w, http.StatusInternalServerError, "ingest error")
--- a/ingestion/internal/extract/extract.go
+++ b/ingestion/internal/extract/extract.go
@@ -0,0 +1,39 @@
+// ingestion/internal/extract/extract.go
+package extract
+
+import (
+	"fmt"
+	"os"
+	"strings"
+)
+
+// Text returns the plain-text content of the file at path, picking the
+// extraction strategy from the file extension (compared case-insensitively):
+//
+//   - .md and .txt files are read from disk and returned verbatim;
+//   - .pdf files are handed to extractPDF;
+//   - any other extension produces an "unsupported file extension" error.
+func Text(path string) (string, error) {
+	ext := strings.ToLower(fileExt(path))
+
+	if ext == ".pdf" {
+		return extractPDF(path)
+	}
+	if ext != ".md" && ext != ".txt" {
+		return "", fmt.Errorf("unsupported file extension: %s", ext)
+	}
+
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		return "", fmt.Errorf("read %s: %w", path, err)
+	}
+	return string(raw), nil
+}
+
+// fileExt returns the extension of the last element of path, including the
+// leading dot and lowercased (for example "Report.PDF" yields ".pdf").
+// It returns "" when the last element contains no dot. Both '/' and '\\' are
+// treated as separators regardless of the host OS, so a dot in a parent
+// directory name is never mistaken for an extension.
+func fileExt(path string) string {
+	for i := len(path) - 1; i >= 0; i-- {
+		switch path[i] {
+		case '.':
+			return strings.ToLower(path[i:])
+		case '/', '\\':
+			// Hit a separator before any dot: the file name has no extension.
+			return ""
+		}
+	}
+	return ""
+}
--- a/ingestion/internal/extract/extract_test.go
+++ b/ingestion/internal/extract/extract_test.go
@@ -0,0 +1,62 @@
+// ingestion/internal/extract/extract_test.go
+package extract
+
+import (
+	"os"
+	"os/exec"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestText_Markdown checks that a .md file comes back exactly as written.
+func TestText_Markdown(t *testing.T) {
+	const want = "# Hello\n\nWorld."
+	file := filepath.Join(t.TempDir(), "note.md")
+	require.NoError(t, os.WriteFile(file, []byte(want), 0o644))
+
+	text, err := Text(file)
+	require.NoError(t, err)
+	assert.Equal(t, want, text)
+}
+
+// TestText_Txt checks that a .txt file comes back exactly as written.
+func TestText_Txt(t *testing.T) {
+	const want = "plain text"
+	file := filepath.Join(t.TempDir(), "note.txt")
+	require.NoError(t, os.WriteFile(file, []byte(want), 0o644))
+
+	text, err := Text(file)
+	require.NoError(t, err)
+	assert.Equal(t, want, text)
+}
+
+// TestText_UnsupportedExtension checks that an extension outside the
+// supported set is rejected with an "unsupported" error.
+func TestText_UnsupportedExtension(t *testing.T) {
+	file := filepath.Join(t.TempDir(), "data.csv")
+	require.NoError(t, os.WriteFile(file, []byte("a,b,c"), 0o644))
+
+	_, extractErr := Text(file)
+	assert.ErrorContains(t, extractErr, "unsupported")
+}
+
+// TestText_PDF runs a hand-written one-page PDF through Text and expects the
+// page text back. It is skipped when pdftotext is not on PATH.
+func TestText_PDF(t *testing.T) {
+	_, lookErr := exec.LookPath("pdftotext")
+	if lookErr != nil {
+		t.Skip("pdftotext not available")
+	}
+
+	// Minimal valid PDF containing the text "Hello PDF".
+	const fixture = "%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n" +
+		"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n" +
+		"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>>>>>>>endobj\n" +
+		"4 0 obj<</Length 44>>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" +
+		"xref\n0 5\n0000000000 65535 f\n0000000009 00000 n\n0000000058 00000 n\n0000000115 00000 n\n0000000310 00000 n\n" +
+		"trailer<</Size 5/Root 1 0 R>>\nstartxref\n406\n%%EOF\n"
+
+	file := filepath.Join(t.TempDir(), "test.pdf")
+	require.NoError(t, os.WriteFile(file, []byte(fixture), 0o644))
+
+	text, err := Text(file)
+	require.NoError(t, err)
+	assert.Contains(t, text, "Hello PDF")
+}
--- a/ingestion/internal/extract/pdf.go
+++ b/ingestion/internal/extract/pdf.go
@@ -0,0 +1,28 @@
+// ingestion/internal/extract/pdf.go
+package extract
+
+import (
+	"bytes"
+	"fmt"
+	"os/exec"
+	"strings"
+)
+
+// extractPDF runs the pdftotext command on path and returns the text it
+// prints to standard output, with leading and trailing whitespace trimmed.
+// pdftotext must be installed (package: poppler-utils on Alpine/Debian, poppler on Homebrew).
+//
+// On failure the returned error is prefixed with "pdftotext: ", includes any
+// message the tool wrote to stderr, and wraps the underlying exec error so
+// callers can inspect it with errors.Is / errors.As (for example to detect a
+// missing binary).
+func extractPDF(path string) (string, error) {
+	// A relative path such as "-layout.pdf" would be parsed by pdftotext as a
+	// command-line option; anchoring it to the current directory keeps it a
+	// file name without changing which file is opened.
+	arg := path
+	if strings.HasPrefix(arg, "-") {
+		arg = "./" + arg
+	}
+
+	// The trailing "-" asks pdftotext to write the text to stdout.
+	cmd := exec.Command("pdftotext", "-q", arg, "-")
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		if msg := strings.TrimSpace(stderr.String()); msg != "" {
+			return "", fmt.Errorf("pdftotext: %s: %w", msg, err)
+		}
+		return "", fmt.Errorf("pdftotext: %w", err)
+	}
+
+	return strings.TrimSpace(stdout.String()), nil
+}
--- a/ingestion/internal/pipeline/pipeline.go
+++ b/ingestion/internal/pipeline/pipeline.go
@@ -57,7 +57,8 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
 		allWarnings = append(allWarnings, warnings...)
 	}

-	merged := mergeAll(allPages)
+	resolved := Resolve(allPages, inventory)
+	merged := mergeAll(resolved)

 	date := time.Now().UTC().Format("2006-01-02")
 	var written []string
--- a/ingestion/internal/pipeline/resolve.go
+++ b/ingestion/internal/pipeline/resolve.go
@@ -0,0 +1,88 @@
+// ingestion/internal/pipeline/resolve.go
+package pipeline
+
+import (
+	"path/filepath"
+	"strings"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
+)
+
+// Resolve remaps proposed pages to existing slugs when a fuzzy title match is found.
+//
+// A lookup table is built from the inventory, keyed by (page type, normalized
+// title). Every entry contributes its title and each of its aliases. A title
+// always claims its key; an alias only claims a key that is still free, so an
+// alias never displaces a mapping that is already present.
+//
+// For each proposed page the title is read from its frontmatter and
+// normalized the same way. On a hit within the same page type
+// (entities→entities, concepts→concepts) the file name in page.Path is
+// replaced by "<canonical slug>.md"; the directory is kept. Pages with no
+// match — including pages whose title is missing or normalizes to the empty
+// string — are returned unchanged.
+//
+// The result is a new slice in the same order as proposed.
+func Resolve(proposed []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) []wiki.Page {
+	type key struct {
+		pt         wiki.PageType
+		normalized string
+	}
+
+	lookup := make(map[key]string) // key → canonical slug
+	for pt, entries := range inventory {
+		for _, e := range entries {
+			// Empty keys are never registered: otherwise one entry with a blank
+			// title would capture every untitled proposed page of this type.
+			if n := normalizeTitle(e.Title); n != "" {
+				lookup[key{pt: pt, normalized: n}] = e.Slug
+			}
+			for _, alias := range e.Aliases {
+				n := normalizeTitle(alias)
+				if n == "" {
+					continue
+				}
+				ak := key{pt: pt, normalized: n}
+				if _, exists := lookup[ak]; !exists {
+					lookup[ak] = e.Slug
+				}
+			}
+		}
+	}
+
+	out := make([]wiki.Page, 0, len(proposed))
+	for _, page := range proposed {
+		if n := normalizeTitle(extractTitle(page.Content)); n != "" {
+			k := key{pt: pageTypeFromPath(page.Path), normalized: n}
+			if canonicalSlug, ok := lookup[k]; ok {
+				page.Path = filepath.Join(filepath.Dir(page.Path), canonicalSlug+".md")
+			}
+		}
+		out = append(out, page)
+	}
+	return out
+}
+
+// normalizeTitle reduces a title to a canonical form for fuzzy comparison:
+// it lowercases the text, drops a single leading article ("the", "a", "an")
+// when more words follow, turns hyphens into spaces, and collapses every run
+// of whitespace into one space.
+//
+//	"The Shape Up Method" → "shape up method"
+//	"Shape-Up  method"    → "shape up method"
+//	"The A List"          → "a list"   (only the first article is removed)
+//	"A-Team"              → "a team"   (a hyphenated first word is not an article)
+func normalizeTitle(s string) string {
+	words := strings.Fields(strings.ToLower(s))
+
+	// Strip at most one article, and never the only word, so that a title
+	// consisting solely of "the" / "a" / "an" does not normalize to "".
+	if len(words) > 1 {
+		switch words[0] {
+		case "the", "a", "an":
+			words = words[1:]
+		}
+	}
+
+	// Hyphens are replaced only after the article check so that "a-team"
+	// keeps its first word; re-splitting collapses any spaces they introduce.
+	joined := strings.ReplaceAll(strings.Join(words, " "), "-", " ")
+	return strings.Join(strings.Fields(joined), " ")
+}
+
+// pageTypeFromPath returns the second slash-separated component of path as a
+// wiki.PageType, e.g. "entities" for "wiki/entities/foo.md". Backslashes are
+// converted with filepath.ToSlash first. A path with fewer than two
+// components yields the empty PageType.
+func pageTypeFromPath(path string) wiki.PageType {
+	segments := strings.Split(filepath.ToSlash(path), "/")
+	if len(segments) < 2 {
+		return ""
+	}
+	return wiki.PageType(segments[1])
+}
+
+// extractTitle reads the title field from YAML frontmatter in content.
+// Falls back to empty string if not found.
+//
+// Only the first 30 lines of content are examined. Frontmatter starts at the
+// first line that is exactly "---" (ignoring surrounding whitespace) and ends
+// at the next such line. The value of the first "title:" key inside it is
+// returned with surrounding whitespace and quote characters removed.
+func extractTitle(content string) string {
+	const maxLines = 30
+
+	// Split one piece more than we inspect: the final piece from SplitN is the
+	// unsplit remainder of the document and must never be parsed as a line.
+	lines := strings.SplitN(content, "\n", maxLines+1)
+	if len(lines) > maxLines {
+		lines = lines[:maxLines]
+	}
+
+	inFM := false
+	for _, line := range lines {
+		if strings.TrimSpace(line) == "---" {
+			if !inFM {
+				inFM = true
+				continue
+			}
+			break // closing delimiter: no title in the frontmatter
+		}
+		if !inFM {
+			continue
+		}
+		key, val, ok := strings.Cut(line, ":")
+		if ok && strings.TrimSpace(key) == "title" {
+			return strings.Trim(strings.TrimSpace(val), `"'`)
+		}
+	}
+	return ""
+}
--- a/ingestion/internal/pipeline/resolve_test.go
+++ b/ingestion/internal/pipeline/resolve_test.go
@@ -0,0 +1,90 @@
+// ingestion/internal/pipeline/resolve_test.go
+package pipeline
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
+)
+
+// TestResolve_NoMatch: a page whose title matches neither a title nor an
+// alias in the inventory keeps its proposed path.
+func TestResolve_NoMatch(t *testing.T) {
+	inv := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeEntity: {
+			{Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer"}},
+		},
+	}
+	pages := []wiki.Page{
+		{Path: "wiki/entities/new-person.md", Content: "---\ntitle: New Person\n---\n"},
+	}
+
+	resolved := Resolve(pages, inv)
+
+	assert.Len(t, resolved, 1)
+	assert.Equal(t, "wiki/entities/new-person.md", resolved[0].Path)
+}
+
+// TestResolve_TitleMatchRedirectsSlug: a page proposed under a different slug
+// but carrying an existing title is redirected to the existing slug.
+func TestResolve_TitleMatchRedirectsSlug(t *testing.T) {
+	inv := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeEntity: {
+			{Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil},
+		},
+	}
+	pages := []wiki.Page{
+		{Path: "wiki/entities/ryan-singer-the-designer.md", Content: "---\ntitle: Ryan Singer\n---\n"},
+	}
+
+	resolved := Resolve(pages, inv)
+
+	assert.Len(t, resolved, 1)
+	assert.Equal(t, "wiki/entities/ryan-singer.md", resolved[0].Path)
+}
+
+// TestResolve_AliasMatchRedirectsSlug: a page whose title equals an alias of
+// an existing entry is redirected to that entry's slug.
+func TestResolve_AliasMatchRedirectsSlug(t *testing.T) {
+	inv := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeEntity: {
+			{Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer", "R. Singer"}},
+		},
+	}
+	pages := []wiki.Page{
+		{Path: "wiki/entities/singer.md", Content: "---\ntitle: Singer\n---\n"},
+	}
+
+	resolved := Resolve(pages, inv)
+
+	assert.Len(t, resolved, 1)
+	assert.Equal(t, "wiki/entities/ryan-singer.md", resolved[0].Path)
+}
+
+// TestResolve_NormalizationCaseAndArticles: a leading article in the proposed
+// title does not prevent a match against the article-free inventory title.
+func TestResolve_NormalizationCaseAndArticles(t *testing.T) {
+	inv := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeConcept: {
+			{Slug: "shape-up-method", Title: "Shape Up Method", Aliases: nil},
+		},
+	}
+	pages := []wiki.Page{
+		{Path: "wiki/concepts/the-shape-up-method.md", Content: "---\ntitle: The Shape Up Method\n---\n"},
+	}
+
+	resolved := Resolve(pages, inv)
+
+	assert.Len(t, resolved, 1)
+	assert.Equal(t, "wiki/concepts/shape-up-method.md", resolved[0].Path)
+}
+
+// TestResolve_OnlyMatchesSamePageType: a concept page is not redirected to an
+// entity that happens to share its title.
+func TestResolve_OnlyMatchesSamePageType(t *testing.T) {
+	inv := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeEntity: {
+			{Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil},
+		},
+		wiki.PageTypeConcept: {},
+	}
+	pages := []wiki.Page{
+		{Path: "wiki/concepts/ryan-singer.md", Content: "---\ntitle: Ryan Singer\n---\n"},
+	}
+
+	resolved := Resolve(pages, inv)
+
+	assert.Len(t, resolved, 1)
+	assert.Equal(t, "wiki/concepts/ryan-singer.md", resolved[0].Path)
+}
+
+// TestResolve_EmptyInventory: with nothing to match against, the proposed
+// pages come back unchanged.
+func TestResolve_EmptyInventory(t *testing.T) {
+	pages := []wiki.Page{
+		{Path: "wiki/entities/first.md", Content: "---\ntitle: First\n---\n"},
+	}
+
+	resolved := Resolve(pages, map[wiki.PageType][]wiki.Entry{})
+
+	assert.Equal(t, pages, resolved)
+}
--- a/ingestion/internal/watcher/watcher.go
+++ b/ingestion/internal/watcher/watcher.go
@@ -12,6 +12,7 @@ import (
 	"time"
 	"unicode"

+	"github.com/mathiasbq/hyperguild/ingestion/internal/extract"
 	"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
 )

@@ -99,12 +100,12 @@ func processFile(ctx context.Context, cfg Config, path, date string) error {
 	filename := filepath.Base(path)
 	source := deriveSource(filename)

-	content, err := os.ReadFile(path)
+	content, err := extract.Text(path)
 	if err != nil {
-		return fmt.Errorf("read file: %w", err)
+		return fmt.Errorf("extract text: %w", err)
 	}

-	_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, string(content), source, false)
+	_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, content, source, false)
 	if runErr != nil {
 		// Copy to failed/ and leave a .failed marker so we don't retry.
 		failedDir := filepath.Join(cfg.BrainDir, "raw", "failed")
--- a/ingestion/internal/wiki/inventory.go
+++ b/ingestion/internal/wiki/inventory.go
@@ -32,23 +32,26 @@ func LoadInventory(brainDir string) (map[PageType][]Entry, error) {
 			}
 			slug := strings.TrimSuffix(e.Name(), ".md")
 			path := filepath.Join(dir, e.Name())
-			title := readTitle(path, slug)
-			result[pt] = append(result[pt], Entry{Slug: slug, Title: title, Type: pt})
+			title, aliases := readFrontmatter(path, slug)
+			result[pt] = append(result[pt], Entry{Slug: slug, Title: title, Aliases: aliases, Type: pt})
 		}
 	}
 	return result, nil
 }

-// readTitle extracts the title from YAML frontmatter, falling back to slug.
-func readTitle(path, fallback string) string {
+// readFrontmatter extracts title and aliases from YAML frontmatter.
+// Falls back to slug for title and empty aliases on any error.
+func readFrontmatter(path, fallbackSlug string) (title string, aliases []string) {
+	title = fallbackSlug
 	f, err := os.Open(path)
 	if err != nil {
-		return fallback
+		return
 	}
 	defer f.Close()

 	scanner := bufio.NewScanner(f)
 	inFM := false
+	inAliases := false
 	for scanner.Scan() {
 		line := scanner.Text()
 		if strings.TrimSpace(line) == "---" {
@@ -56,14 +59,32 @@ func readTitle(path, fallback string) string {
 				inFM = true
 				continue
 			}
-			break
+			break // end of frontmatter
 		}
-		if inFM {
-			key, val, ok := strings.Cut(line, ":")
-			if ok && strings.TrimSpace(key) == "title" {
-				return strings.Trim(strings.TrimSpace(val), `"'`)
+		if !inFM {
+			continue
+		}
+
+		// Detect alias list items (lines starting with "  - ").
+		if inAliases {
+			trimmed := strings.TrimSpace(line)
+			if strings.HasPrefix(trimmed, "- ") {
+				aliases = append(aliases, strings.TrimPrefix(trimmed, "- "))
+				continue
 			}
+			inAliases = false // end of alias block
+		}
+
+		key, val, ok := strings.Cut(line, ":")
+		if !ok {
+			continue
+		}
+		switch strings.TrimSpace(key) {
+		case "title":
+			title = strings.Trim(strings.TrimSpace(val), `"'`)
+		case "aliases":
+			inAliases = true
 		}
 	}
-	return fallback
+	return
 }
--- a/ingestion/internal/wiki/inventory_test.go
+++ b/ingestion/internal/wiki/inventory_test.go
@@ -60,3 +60,24 @@ func TestLoadInventory_MissingDirsOk(t *testing.T) {
 	require.NoError(t, err)
 	assert.NotNil(t, inv)
 }
+
+// TestLoadInventory_ReadsAliases writes one entity page whose frontmatter has
+// a title and an aliases list, then checks that LoadInventory reports both.
+func TestLoadInventory_ReadsAliases(t *testing.T) {
+	root := t.TempDir()
+	for _, sub := range []string{"entities", "concepts", "sources"} {
+		require.NoError(t, os.MkdirAll(filepath.Join(root, "wiki", sub), 0o755))
+	}
+
+	page := []byte("---\ntitle: Ryan Singer\naliases:\n  - Singer\n  - R. Singer\n---\n\n## Description\n\nDesigner.\n")
+	pagePath := filepath.Join(root, "wiki", "entities", "ryan-singer.md")
+	require.NoError(t, os.WriteFile(pagePath, page, 0o644))
+
+	inv, err := LoadInventory(root)
+	require.NoError(t, err)
+
+	entities := inv[PageTypeEntity]
+	require.Len(t, entities, 1)
+	assert.Equal(t, "Ryan Singer", entities[0].Title)
+	assert.Equal(t, []string{"Singer", "R. Singer"}, entities[0].Aliases)
+}
--- a/ingestion/internal/wiki/slug.go
+++ b/ingestion/internal/wiki/slug.go
@@ -21,7 +21,7 @@ func Slug(title string) string {
 		case unicode.IsLetter(r) || unicode.IsDigit(r):
 			b.WriteRune(r)
 			prevHyphen = false
-		// all other characters (apostrophes, colons, dots, etc.) are dropped
+			// all other characters (apostrophes, colons, dots, etc.) are dropped
 		}
 	}
 	return strings.TrimRight(b.String(), "-")
--- a/ingestion/internal/wiki/types.go
+++ b/ingestion/internal/wiki/types.go
@@ -18,7 +18,8 @@ type Page struct {

 // Entry is a summary of an existing wiki page used to build the inventory.
 type Entry struct {
-	Slug  string
-	Title string
-	Type  PageType
+	Slug    string
+	Title   string
+	Aliases []string
+	Type    PageType
 }