feat(extract): implement PDF extraction via pdftotext

This commit is contained in:
Mathias Bergqvist
2026-04-23 15:53:46 +02:00
parent 43a46d07e5
commit 9cc6c2d053
2 changed files with 45 additions and 3 deletions

View File

@@ -1,8 +1,28 @@
// ingestion/internal/extract/pdf.go
package extract
import "fmt"
import (
"bytes"
"fmt"
"os/exec"
"strings"
)
func extractPDF(_ string) (string, error) {
return "", fmt.Errorf("PDF extraction not implemented")
// extractPDF runs pdftotext on path and returns the extracted text with
// leading and trailing whitespace removed.
//
// pdftotext must be installed (package: poppler-utils on Alpine/Debian,
// poppler on Homebrew).
//
// On failure the returned string is empty and the error wraps the underlying
// os/exec error, so callers can inspect it with errors.Is / errors.As (for
// example errors.Is(err, exec.ErrNotFound) when the binary is missing). Any
// text pdftotext wrote to stderr is included in the error message.
func extractPDF(path string) (string, error) {
	// A path beginning with '-' could be taken by pdftotext for an option
	// instead of a file name; anchoring it to the current directory keeps it
	// pointing at the same file while making it unambiguous.
	if strings.HasPrefix(path, "-") {
		path = "./" + path
	}

	// The trailing "-" is the output argument; the text is collected from the
	// command's stdout below.
	cmd := exec.Command("pdftotext", "-q", path, "-")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Prefer the tool's own diagnostics, but always keep err in the chain
		// (%w) so the cause stays machine-inspectable.
		if msg := strings.TrimSpace(stderr.String()); msg != "" {
			return "", fmt.Errorf("pdftotext: %s: %w", msg, err)
		}
		return "", fmt.Errorf("pdftotext: %w", err)
	}
	return strings.TrimSpace(stdout.String()), nil
}