diff --git a/ingestion/internal/extract/extract_test.go b/ingestion/internal/extract/extract_test.go index 44cd5fb..2ef75aa 100644 --- a/ingestion/internal/extract/extract_test.go +++ b/ingestion/internal/extract/extract_test.go @@ -3,6 +3,7 @@ package extract import ( "os" + "os/exec" "path/filepath" "testing" @@ -38,3 +39,24 @@ func TestText_UnsupportedExtension(t *testing.T) { _, err := Text(path) assert.ErrorContains(t, err, "unsupported") } + +func TestText_PDF(t *testing.T) { + if _, err := exec.LookPath("pdftotext"); err != nil { + t.Skip("pdftotext not available") + } + dir := t.TempDir() + pdfPath := filepath.Join(dir, "test.pdf") + + // Minimal valid PDF containing the text "Hello PDF". + minimalPDF := "%PDF-1.4\n1 0 obj<>endobj\n" + + "2 0 obj<>endobj\n" + + "3 0 obj<>>>>>>>endobj\n" + + "4 0 obj<>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" + + "xref\n0 5\n0000000000 65535 f\n0000000009 00000 n\n0000000058 00000 n\n0000000115 00000 n\n0000000310 00000 n\n" + + "trailer<>\nstartxref\n406\n%%EOF\n" + require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644)) + + got, err := Text(pdfPath) + require.NoError(t, err) + assert.Contains(t, got, "Hello PDF") +} diff --git a/ingestion/internal/extract/pdf.go b/ingestion/internal/extract/pdf.go index 4a073c4..8415a14 100644 --- a/ingestion/internal/extract/pdf.go +++ b/ingestion/internal/extract/pdf.go @@ -1,8 +1,28 @@ // ingestion/internal/extract/pdf.go package extract -import "fmt" +import ( + "bytes" + "fmt" + "os/exec" + "strings" +) -func extractPDF(_ string) (string, error) { - return "", fmt.Errorf("PDF extraction not implemented") +// extractPDF runs pdftotext on path and returns the extracted text. +// pdftotext must be installed (package: poppler-utils on Alpine/Debian, poppler on Homebrew). 
// extractPDF returns the text content of the PDF at path, as produced by the
// external pdftotext tool, with leading and trailing whitespace removed.
//
// On failure the returned string is empty and the error is prefixed with
// "pdftotext: ". It includes whatever the tool wrote to stderr (if anything)
// and wraps the underlying os/exec error, so callers can tell a missing
// binary (errors.Is(err, exec.ErrNotFound)) from a failed run
// (errors.As with *exec.ExitError).
func extractPDF(path string) (string, error) {
	// "-q" silences pdftotext's own diagnostics; the trailing "-" asks it to
	// write the extracted text to stdout instead of a file.
	cmd := exec.Command("pdftotext", "-q", path, "-")
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Prefer the tool's own message when it produced one, but always keep
		// the original error in the chain rather than flattening it to text.
		if msg := strings.TrimSpace(stderr.String()); msg != "" {
			return "", fmt.Errorf("pdftotext: %s: %w", msg, err)
		}
		return "", fmt.Errorf("pdftotext: %w", err)
	}

	return strings.TrimSpace(stdout.String()), nil
}