63 lines
1.9 KiB
Go
63 lines
1.9 KiB
Go
// ingestion/internal/extract/extract_test.go
|
|
package extract
|
|
|
|
import (
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
func TestText_Markdown(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "note.md")
|
|
require.NoError(t, os.WriteFile(path, []byte("# Hello\n\nWorld."), 0o644))
|
|
|
|
got, err := Text(path)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, "# Hello\n\nWorld.", got)
|
|
}
|
|
|
|
func TestText_Txt(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "note.txt")
|
|
require.NoError(t, os.WriteFile(path, []byte("plain text"), 0o644))
|
|
|
|
got, err := Text(path)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, "plain text", got)
|
|
}
|
|
|
|
func TestText_UnsupportedExtension(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "data.csv")
|
|
require.NoError(t, os.WriteFile(path, []byte("a,b,c"), 0o644))
|
|
|
|
_, err := Text(path)
|
|
assert.ErrorContains(t, err, "unsupported")
|
|
}
|
|
|
|
func TestText_PDF(t *testing.T) {
|
|
if _, err := exec.LookPath("pdftotext"); err != nil {
|
|
t.Skip("pdftotext not available")
|
|
}
|
|
dir := t.TempDir()
|
|
pdfPath := filepath.Join(dir, "test.pdf")
|
|
|
|
// Minimal valid PDF containing the text "Hello PDF".
|
|
minimalPDF := "%PDF-1.4\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n" +
|
|
"2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n" +
|
|
"3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>>>>>>>endobj\n" +
|
|
"4 0 obj<</Length 44>>\nstream\nBT /F1 12 Tf 100 700 Td (Hello PDF) Tj ET\nendstream\nendobj\n" +
|
|
"xref\n0 5\n0000000000 65535 f\n0000000009 00000 n\n0000000058 00000 n\n0000000115 00000 n\n0000000310 00000 n\n" +
|
|
"trailer<</Size 5/Root 1 0 R>>\nstartxref\n406\n%%EOF\n"
|
|
require.NoError(t, os.WriteFile(pdfPath, []byte(minimalPDF), 0o644))
|
|
|
|
got, err := Text(pdfPath)
|
|
require.NoError(t, err)
|
|
assert.Contains(t, got, "Hello PDF")
|
|
}
|