feat(extract): implement PDF extraction via pdftotext
This commit is contained in:
@@ -3,6 +3,7 @@ package extract
|
||||
|
||||
import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
@@ -38,3 +39,24 @@ func TestText_UnsupportedExtension(t *testing.T) {
|
||||
_, err := Text(path)
|
||||
assert.ErrorContains(t, err, "unsupported")
|
||||
}
|
||||
|
||||
func TestText_PDF(t *testing.T) {
|
||||
if _, err := exec.LookPath("pdftotext"); err != nil {
|
||||
t.Skip("pdftotext not available")
|
||||
}
|
||||
dir := t.TempDir()
|
||||
pdfPath := filepath.Join(dir, "test.pdf")
|
||||
|
||||
// Minimal valid PDF containing the text "Hello PDF".
|
||||
minimalPDF := "%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n" +
|
||||
"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n" +
|
||||
"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>>>>>>>endobj\n" +
|
||||
"4 0 obj<</Length 44>>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" +
|
||||
"xref\n0 5\n0000000000 65535 f\n0000000009 00000 n\n0000000058 00000 n\n0000000115 00000 n\n0000000310 00000 n\n" +
|
||||
"trailer<</Size 5/Root 1 0 R>>\nstartxref\n406\n%%EOF\n"
|
||||
require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644))
|
||||
|
||||
got, err := Text(pdfPath)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, got, "Hello PDF")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user