feat(watcher,api): use extract.Text() for file reading — fixes PDF ingestion
This commit is contained in:
@@ -11,6 +11,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/extract"
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
|
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/search"
|
"github.com/mathiasbq/hyperguild/ingestion/internal/search"
|
||||||
)
|
)
|
||||||
@@ -214,16 +215,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) {
|
|||||||
if !supportedExtensions[ext] {
|
if !supportedExtensions[ext] {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
content, readErr := os.ReadFile(path)
|
content, readErr := extract.Text(path)
|
||||||
if readErr != nil {
|
if readErr != nil {
|
||||||
allWarnings = append(allWarnings, fmt.Sprintf("read %s: %v", path, readErr))
|
allWarnings = append(allWarnings, fmt.Sprintf("extract %s: %v", path, readErr))
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
source := req.Source
|
source := req.Source
|
||||||
if source == "" {
|
if source == "" {
|
||||||
source = filepath.Base(path)
|
source = filepath.Base(path)
|
||||||
}
|
}
|
||||||
result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun)
|
result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun)
|
||||||
if runErr != nil {
|
if runErr != nil {
|
||||||
allWarnings = append(allWarnings, fmt.Sprintf("ingest %s: %v", path, runErr))
|
allWarnings = append(allWarnings, fmt.Sprintf("ingest %s: %v", path, runErr))
|
||||||
return nil
|
return nil
|
||||||
@@ -243,16 +244,16 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) {
|
|||||||
writeError(w, http.StatusBadRequest, fmt.Sprintf("unsupported file extension: %s", ext))
|
writeError(w, http.StatusBadRequest, fmt.Sprintf("unsupported file extension: %s", ext))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
content, readErr := os.ReadFile(req.Path)
|
content, readErr := extract.Text(req.Path)
|
||||||
if readErr != nil {
|
if readErr != nil {
|
||||||
writeError(w, http.StatusInternalServerError, fmt.Sprintf("read file: %v", readErr))
|
writeError(w, http.StatusInternalServerError, fmt.Sprintf("extract text: %v", readErr))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
source := req.Source
|
source := req.Source
|
||||||
if source == "" {
|
if source == "" {
|
||||||
source = filepath.Base(req.Path)
|
source = filepath.Base(req.Path)
|
||||||
}
|
}
|
||||||
result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, string(content), source, req.DryRun)
|
result, runErr := pipeline.Run(r.Context(), h.pipeline, h.brainDir, content, source, req.DryRun)
|
||||||
if runErr != nil {
|
if runErr != nil {
|
||||||
h.logger.Error("ingest-path failed", "path", req.Path, "err", runErr)
|
h.logger.Error("ingest-path failed", "path", req.Path, "err", runErr)
|
||||||
writeError(w, http.StatusInternalServerError, "ingest error")
|
writeError(w, http.StatusInternalServerError, "ingest error")
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
"unicode"
|
"unicode"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/extract"
|
||||||
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
|
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -88,12 +89,12 @@ func processFile(ctx context.Context, cfg Config, path, date string) error {
|
|||||||
filename := filepath.Base(path)
|
filename := filepath.Base(path)
|
||||||
source := deriveSource(filename)
|
source := deriveSource(filename)
|
||||||
|
|
||||||
content, err := os.ReadFile(path)
|
content, err := extract.Text(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("read file: %w", err)
|
return fmt.Errorf("extract text: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, string(content), source, false)
|
_, runErr := pipeline.Run(ctx, cfg.Pipeline, cfg.BrainDir, content, source, false)
|
||||||
if runErr != nil {
|
if runErr != nil {
|
||||||
// Move to failed/.
|
// Move to failed/.
|
||||||
failedDir := filepath.Join(cfg.BrainDir, "raw", "failed")
|
failedDir := filepath.Join(cfg.BrainDir, "raw", "failed")
|
||||||
|
|||||||
Reference in New Issue
Block a user