diff --git a/ingestion/internal/api/handler.go b/ingestion/internal/api/handler.go index 0637cde..141ca0a 100644 --- a/ingestion/internal/api/handler.go +++ b/ingestion/internal/api/handler.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "github.com/mathiasbq/hyperguild/ingestion/internal/extract" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" "github.com/mathiasbq/hyperguild/ingestion/internal/search" ) @@ -214,16 +215,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) { if !supportedExtensions[ext] { return nil } - content, readErr := os.ReadFile(path) + content, readErr := extract.Text(path) if readErr != nil { - allWarnings = append(allWarnings, fmt.Sprintf("read %s: %v", path, readErr)) + allWarnings = append(allWarnings, fmt.Sprintf("extract %s: %v", path, readErr)) return nil } source := req.Source if source == "" { source = filepath.Base(path) } - result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun) + result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun) if runErr != nil { allWarnings = append(allWarnings, fmt.Sprintf("ingest %s: %v", path, runErr)) return nil @@ -243,16 +244,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusBadRequest, fmt.Sprintf("unsupported file extension: %s", ext)) return } - content, readErr := os.ReadFile(req.Path) + content, readErr := extract.Text(req.Path) if readErr != nil { - writeError(w, http.StatusInternalServerError, fmt.Sprintf("read file: %v", readErr)) + writeError(w, http.StatusInternalServerError, fmt.Sprintf("extract text: %v", readErr)) return } source := req.Source if source == "" { source = filepath.Base(req.Path) } - result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun) + result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun) if runErr != nil { h.logger.Error("ingest-path failed", "path", req.Path, "err", runErr) writeError(w, http.StatusInternalServerError, "ingest error") diff --git a/ingestion/internal/watcher/watcher.go b/ingestion/internal/watcher/watcher.go index a65db79..2a1c0fb 100644 --- a/ingestion/internal/watcher/watcher.go +++ b/ingestion/internal/watcher/watcher.go @@ -11,6 +11,7 @@ import ( "time" "unicode" + "github.com/mathiasbq/hyperguild/ingestion/internal/extract" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" ) @@ -88,12 +89,12 @@ func processFile(ctx context.Context, cfg Config, path, date string) error { filename := filepath.Base(path) source := deriveSource(filename) - content, err := os.ReadFile(path) + content, err := extract.Text(path) if err != nil { - return fmt.Errorf("read file: %w", err) + return fmt.Errorf("extract text: %w", err) } - _, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, string(content), source, false) + _, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, content, source, false) if runErr != nil { // Move to failed/. failedDir := filepath.Join(cfg.BrainDir, "raw", "failed")