Files
hyperguild/ingestion/internal/extract/pdf.go
2026-04-23 15:53:46 +02:00

29 lines
677 B
Go

// ingestion/internal/extract/pdf.go
package extract
import (
"bytes"
"fmt"
"os/exec"
"strings"
)
// extractPDF runs pdftotext on path and returns the extracted text.
// pdftotext must be installed (package: poppler-utils on Alpine/Debian, poppler on Homebrew).
func extractPDF(path string) (string, error) {
cmd := exec.Command("pdftotext", "-q", path, "-")
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
errMsg := strings.TrimSpace(stderr.String())
if errMsg == "" {
errMsg = err.Error()
}
return "", fmt.Errorf("pdftotext: %s", errMsg)
}
return strings.TrimSpace(stdout.String()), nil
}