From 43a46d07e544b24590dd262739911b4f57627c83 Mon Sep 17 00:00:00 2001
From: Mathias Bergqvist
Date: Thu, 23 Apr 2026 15:45:20 +0200
Subject: [PATCH] feat(extract): add Text() dispatcher with md/txt passthrough

---
Note: the blob ids on the "index" lines predate the review edits below;
regenerate the patch with git format-patch once the changes are committed.

 ingestion/internal/extract/extract.go      | 51 ++++++++++++++++++++++
 ingestion/internal/extract/extract_test.go | 51 ++++++++++++++++++++++
 ingestion/internal/extract/pdf.go          | 11 +++++
 3 files changed, 113 insertions(+)
 create mode 100644 ingestion/internal/extract/extract.go
 create mode 100644 ingestion/internal/extract/extract_test.go
 create mode 100644 ingestion/internal/extract/pdf.go

diff --git a/ingestion/internal/extract/extract.go b/ingestion/internal/extract/extract.go
new file mode 100644
index 0000000..725c85f
--- /dev/null
+++ b/ingestion/internal/extract/extract.go
@@ -0,0 +1,51 @@
+// ingestion/internal/extract/extract.go
+
+// Package extract converts supported document files into plain text.
+package extract
+
+import (
+	"fmt"
+	"os"
+	"strings"
+)
+
+// Text reads the file at path and returns its plain-text content.
+//
+// The format is chosen from the file extension, compared case-insensitively:
+//   - .md, .txt: the file contents are returned unchanged.
+//   - .pdf: the path is handed to extractPDF.
+//
+// Any other extension, including a missing one, results in an error.
+func Text(path string) (string, error) {
+	ext := strings.ToLower(fileExt(path))
+	switch ext {
+	case ".md", ".txt":
+		b, err := os.ReadFile(path)
+		if err != nil {
+			return "", fmt.Errorf("read %s: %w", path, err)
+		}
+		return string(b), nil
+	case ".pdf":
+		return extractPDF(path)
+	default:
+		// %q keeps the message unambiguous when the extension is empty.
+		return "", fmt.Errorf("unsupported file extension: %q", ext)
+	}
+}
+
+// fileExt returns the extension of the last path element, including the
+// leading dot and with its case preserved; Text lowercases the result.
+// Both forward slashes and backslashes end the search, so a dot in a parent
+// directory name is never treated as an extension. It returns "" when the
+// last element contains no dot.
+func fileExt(path string) string {
+	for i := len(path) - 1; i >= 0; i-- {
+		if path[i] == '.' {
+			return path[i:]
+		}
+		if path[i] == '/' || path[i] == '\\' {
+			break
+		}
+	}
+	return ""
+}
diff --git a/ingestion/internal/extract/extract_test.go b/ingestion/internal/extract/extract_test.go
new file mode 100644
index 0000000..44cd5fb
--- /dev/null
+++ b/ingestion/internal/extract/extract_test.go
@@ -0,0 +1,51 @@
+// ingestion/internal/extract/extract_test.go
+
+package extract
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestText_Markdown(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "note.md")
+	require.NoError(t, os.WriteFile(path, []byte("# Hello\n\nWorld."), 0o644))
+
+	got, err := Text(path)
+	require.NoError(t, err)
+	assert.Equal(t, "# Hello\n\nWorld.", got)
+}
+
+func TestText_Txt(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "note.txt")
+	require.NoError(t, os.WriteFile(path, []byte("plain text"), 0o644))
+
+	got, err := Text(path)
+	require.NoError(t, err)
+	assert.Equal(t, "plain text", got)
+}
+
+func TestText_UppercaseExtension(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "NOTE.MD")
+	require.NoError(t, os.WriteFile(path, []byte("shout"), 0o644))
+
+	got, err := Text(path)
+	require.NoError(t, err)
+	assert.Equal(t, "shout", got)
+}
+
+func TestText_UnsupportedExtension(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "data.csv")
+	require.NoError(t, os.WriteFile(path, []byte("a,b,c"), 0o644))
+
+	_, err := Text(path)
+	assert.ErrorContains(t, err, "unsupported")
+}
diff --git a/ingestion/internal/extract/pdf.go b/ingestion/internal/extract/pdf.go
new file mode 100644
index 0000000..4a073c4
--- /dev/null
+++ b/ingestion/internal/extract/pdf.go
@@ -0,0 +1,11 @@
+// ingestion/internal/extract/pdf.go
+
+package extract
+
+import "fmt"
+
+// extractPDF is a stub for PDF text extraction: it ignores its argument and
+// always returns a "not implemented" error.
+func extractPDF(_ string) (string, error) {
+	return "", fmt.Errorf("PDF extraction not implemented")
+}