feat(watcher,api): use extract.Text() for file reading — fixes PDF ingestion

This commit is contained in:
Mathias Bergqvist
2026-04-23 16:01:36 +02:00
parent 53e46781b1
commit 2975eadc87
2 changed files with 11 additions and 9 deletions

View File

@@ -11,6 +11,7 @@ import (
"strings" "strings"
"time" "time"
"github.com/mathiasbq/hyperguild/ingestion/internal/extract"
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
"github.com/mathiasbq/hyperguild/ingestion/internal/search" "github.com/mathiasbq/hyperguild/ingestion/internal/search"
) )
@@ -214,16 +215,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) {
if !supportedExtensions[ext] { if !supportedExtensions[ext] {
return nil return nil
} }
content, readErr := os.ReadFile(path) content, readErr := extract.Text(path)
if readErr != nil { if readErr != nil {
allWarnings = append(allWarnings, fmt.Sprintf("read %s: %v", path, readErr)) allWarnings = append(allWarnings, fmt.Sprintf("extract %s: %v", path, readErr))
return nil return nil
} }
source := req.Source source := req.Source
if source == "" { if source == "" {
source = filepath.Base(path) source = filepath.Base(path)
} }
result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun) result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun)
if runErr != nil { if runErr != nil {
allWarnings = append(allWarnings, fmt.Sprintf("ingest %s: %v", path, runErr)) allWarnings = append(allWarnings, fmt.Sprintf("ingest %s: %v", path, runErr))
return nil return nil
@@ -243,16 +244,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusBadRequest, fmt.Sprintf("unsupported file extension: %s", ext)) writeError(w, http.StatusBadRequest, fmt.Sprintf("unsupported file extension: %s", ext))
return return
} }
content, readErr := os.ReadFile(req.Path) content, readErr := extract.Text(req.Path)
if readErr != nil { if readErr != nil {
writeError(w, http.StatusInternalServerError, fmt.Sprintf("read file: %v", readErr)) writeError(w, http.StatusInternalServerError, fmt.Sprintf("extract text: %v", readErr))
return return
} }
source := req.Source source := req.Source
if source == "" { if source == "" {
source = filepath.Base(req.Path) source = filepath.Base(req.Path)
} }
result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun) result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun)
if runErr != nil { if runErr != nil {
h.logger.Error("ingest-path failed", "path", req.Path, "err", runErr) h.logger.Error("ingest-path failed", "path", req.Path, "err", runErr)
writeError(w, http.StatusInternalServerError, "ingest error") writeError(w, http.StatusInternalServerError, "ingest error")

View File

@@ -11,6 +11,7 @@ import (
"time" "time"
"unicode" "unicode"
"github.com/mathiasbq/hyperguild/ingestion/internal/extract"
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
) )
@@ -88,12 +89,12 @@ func processFile(ctx context.Context, cfg Config, path, date string) error {
filename := filepath.Base(path) filename := filepath.Base(path)
source := deriveSource(filename) source := deriveSource(filename)
content, err := os.ReadFile(path) content, err := extract.Text(path)
if err != nil { if err != nil {
return fmt.Errorf("read file: %w", err) return fmt.Errorf("extract text: %w", err)
} }
_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, string(content), source, false) _, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, content, source, false)
if runErr != nil { if runErr != nil {
// Move to failed/. // Move to failed/.
failedDir := filepath.Join(cfg.BrainDir, "raw", "failed") failedDir := filepath.Join(cfg.BrainDir, "raw", "failed")