feat(extract): implement PDF extraction via pdftotext
This commit is contained in:
@@ -3,6 +3,7 @@ package extract
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
@@ -38,3 +39,24 @@ func TestText_UnsupportedExtension(t *testing.T) {
|
|||||||
_, err := Text(path)
|
_, err := Text(path)
|
||||||
assert.ErrorContains(t, err, "unsupported")
|
assert.ErrorContains(t, err, "unsupported")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestText_PDF(t *testing.T) {
|
||||||
|
if _, err := exec.LookPath("pdftotext"); err != nil {
|
||||||
|
t.Skip("pdftotext not available")
|
||||||
|
}
|
||||||
|
dir := t.TempDir()
|
||||||
|
pdfPath := filepath.Join(dir, "test.pdf")
|
||||||
|
|
||||||
|
// Minimal valid PDF containing the text "Hello PDF".
|
||||||
|
minimalPDF := "%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n" +
|
||||||
|
"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n" +
|
||||||
|
"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>>>>>>>endobj\n" +
|
||||||
|
"4 0 obj<</Length 44>>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" +
|
||||||
|
"xref\n0 5\n0000000000 65535 f\n0000000009 00000 n\n0000000058 00000 n\n0000000115 00000 n\n0000000310 00000 n\n" +
|
||||||
|
"trailer<</Size 5/Root 1 0 R>>\nstartxref\n406\n%%EOF\n"
|
||||||
|
require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644))
|
||||||
|
|
||||||
|
got, err := Text(pdfPath)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Contains(t, got, "Hello PDF")
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,8 +1,28 @@
|
|||||||
// ingestion/internal/extract/pdf.go
|
// ingestion/internal/extract/pdf.go
|
||||||
package extract
|
package extract
|
||||||
|
|
||||||
import "fmt"
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
func extractPDF(_ string) (string, error) {
|
// extractPDF runs pdftotext on path and returns the extracted text.
|
||||||
return "", fmt.Errorf("PDF extraction not implemented")
|
// pdftotext must be installed (package: poppler-utils on Alpine/Debian, poppler on Homebrew).
|
||||||
|
func extractPDF(path string) (string, error) {
	// pdftotext writes the extracted text to stdout when the output file is
	// "-"; -q asks it not to print messages or errors. Both streams are
	// captured so that any diagnostics can be attached to the returned error.
	var stdout, stderr bytes.Buffer
	cmd := exec.Command("pdftotext", "-q", path, "-")
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Wrap with %w so callers can still tell a missing binary
		// (errors.Is(err, exec.ErrNotFound)) from a failed run
		// (errors.As with *exec.ExitError). Whatever the tool printed on
		// stderr is included in the message when present.
		if errMsg := strings.TrimSpace(stderr.String()); errMsg != "" {
			return "", fmt.Errorf("pdftotext: %s: %w", errMsg, err)
		}
		return "", fmt.Errorf("pdftotext: %w", err)
	}

	// Surrounding whitespace is trimmed from the extracted text.
	return strings.TrimSpace(stdout.String()), nil
}
|
||||||
|
|||||||
Reference in New Issue
Block a user