Compare commits
2 Commits
923a665365
...
v0.5.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0a70d9e972 | ||
|
|
3e9a648115 |
@@ -68,6 +68,7 @@ func main() {
|
||||
mux.HandleFunc("POST /write", h.Write)
|
||||
mux.HandleFunc("POST /ingest", h.Ingest)
|
||||
mux.HandleFunc("POST /ingest-path", h.IngestPath)
|
||||
mux.HandleFunc("POST /ingest-raw", h.IngestRaw)
|
||||
mux.HandleFunc("POST /backfill-refs", h.BackfillRefs)
|
||||
|
||||
addr := ":" + port
|
||||
|
||||
@@ -272,6 +272,48 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, ingestResponse{Pages: allPages, Warnings: allWarnings})
|
||||
}
|
||||
|
||||
type ingestRawRequest struct {
|
||||
Source string `json:"source"`
|
||||
Pages []pipeline.RawPage `json:"pages"`
|
||||
DryRun bool `json:"dry_run"`
|
||||
}
|
||||
|
||||
// IngestRaw handles POST /ingest-raw — run the pipeline on pre-parsed RawPages,
|
||||
// skipping the LLM extraction step. Use when the caller has already produced
|
||||
// structured page data (e.g. from a more capable model or manual curation).
|
||||
func (h *Handler) IngestRaw(w http.ResponseWriter, r *http.Request) {
|
||||
var req ingestRawRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if strings.TrimSpace(req.Source) == "" {
|
||||
writeError(w, http.StatusBadRequest, "source is required")
|
||||
return
|
||||
}
|
||||
if len(req.Pages) == 0 {
|
||||
writeError(w, http.StatusBadRequest, "pages is required and must be non-empty")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := pipeline.RunRaw(h.brainDir, req.Source, req.Pages, req.DryRun)
|
||||
if err != nil {
|
||||
h.logger.Error("ingest-raw failed", "source", req.Source, "err", err)
|
||||
writeError(w, http.StatusInternalServerError, "ingest error")
|
||||
return
|
||||
}
|
||||
|
||||
pages := result.Pages
|
||||
if pages == nil {
|
||||
pages = []string{}
|
||||
}
|
||||
warnings := result.Warnings
|
||||
if warnings == nil {
|
||||
warnings = []string{}
|
||||
}
|
||||
writeJSON(w, ingestResponse{Pages: pages, Warnings: warnings})
|
||||
}
|
||||
|
||||
// BackfillRefs handles POST /backfill-refs — injects source back-references
|
||||
// into all concept and entity pages based on existing wiki/sources/ pages.
|
||||
func (h *Handler) BackfillRefs(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -226,6 +226,85 @@ func TestIngestPath_File(t *testing.T) {
|
||||
assert.NotEmpty(t, pagesSlice)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// POST /ingest-raw
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestIngestRaw_Validation(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
body map[string]any
|
||||
}{
|
||||
{"missing source", map[string]any{"pages": []any{map[string]any{"title": "X", "type": "concept", "content": "x"}}}},
|
||||
{"missing pages", map[string]any{"source": "test-source"}},
|
||||
{"empty pages", map[string]any{"source": "test-source", "pages": []any{}}},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
_, h := setup(t)
|
||||
body, _ := json.Marshal(tc.body)
|
||||
req := httptest.NewRequest(http.MethodPost, "/ingest-raw", bytes.NewReader(body))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.IngestRaw(rec, req)
|
||||
|
||||
assert.Equal(t, http.StatusBadRequest, rec.Code)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestIngestRaw_Success(t *testing.T) {
|
||||
dir, h := setup(t)
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"source": "test-article",
|
||||
"pages": []any{
|
||||
map[string]any{"title": "Test Article", "type": "source", "subtype": "article", "domain": "Testing", "content": "## Summary\n\nThis is a test article about [[Test Concept]].\n"},
|
||||
map[string]any{"title": "Test Concept", "type": "concept", "domain": "Testing", "content": "A concept for testing.\n"},
|
||||
},
|
||||
})
|
||||
req := httptest.NewRequest(http.MethodPost, "/ingest-raw", bytes.NewReader(body))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.IngestRaw(rec, req)
|
||||
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
var resp map[string]any
|
||||
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &resp))
|
||||
pages := resp["pages"].([]any)
|
||||
assert.Len(t, pages, 2)
|
||||
|
||||
// Verify files were written
|
||||
sourcePath := filepath.Join(dir, "wiki", "sources", "test-article.md")
|
||||
assert.FileExists(t, sourcePath)
|
||||
conceptPath := filepath.Join(dir, "wiki", "concepts", "test-concept.md")
|
||||
assert.FileExists(t, conceptPath)
|
||||
}
|
||||
|
||||
func TestIngestRaw_DryRun(t *testing.T) {
|
||||
dir, h := setup(t)
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"source": "dry-run-test",
|
||||
"pages": []any{
|
||||
map[string]any{"title": "Dry Run Source", "type": "source", "subtype": "article", "content": "Content."},
|
||||
},
|
||||
"dry_run": true,
|
||||
})
|
||||
req := httptest.NewRequest(http.MethodPost, "/ingest-raw", bytes.NewReader(body))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.IngestRaw(rec, req)
|
||||
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
var resp map[string]any
|
||||
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &resp))
|
||||
pages := resp["pages"].([]any)
|
||||
assert.NotEmpty(t, pages)
|
||||
|
||||
// Verify no files were written
|
||||
sourcePath := filepath.Join(dir, "wiki", "sources", "dry-run-test.md")
|
||||
assert.NoFileExists(t, sourcePath)
|
||||
}
|
||||
|
||||
func TestIngestPath_Directory(t *testing.T) {
|
||||
_, h := setup(t)
|
||||
|
||||
|
||||
@@ -18,7 +18,8 @@ type RawPage struct {
|
||||
}
|
||||
|
||||
// ParseRawPages parses LLM output as a JSON array of RawPage objects.
|
||||
// If the array is truncated mid-object (token limit), it salvages all complete objects.
|
||||
// If the output contains invalid JSON escape sequences (e.g. \. from Markdown),
|
||||
// it attempts repair before falling back to truncation recovery.
|
||||
func ParseRawPages(output string) ([]RawPage, []string) {
|
||||
output = strings.TrimSpace(output)
|
||||
if output == "" {
|
||||
@@ -27,23 +28,30 @@ func ParseRawPages(output string) ([]RawPage, []string) {
|
||||
|
||||
output = stripFences(output)
|
||||
|
||||
// Fast path: valid JSON.
|
||||
var pages []RawPage
|
||||
if err := json.Unmarshal([]byte(output), &pages); err == nil {
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// Repair pass: fix invalid escape sequences (e.g. \. \d from Markdown content).
|
||||
repaired := repairJSON(output)
|
||||
if err := json.Unmarshal([]byte(repaired), &pages); err == nil {
|
||||
return pages, []string{"repaired invalid JSON escape sequences in LLM output"}
|
||||
}
|
||||
|
||||
// Truncation recovery: find last `}` that closes a complete object.
|
||||
idx := strings.LastIndex(output, "}")
|
||||
idx := strings.LastIndex(repaired, "}")
|
||||
if idx < 0 {
|
||||
return nil, []string{"LLM output contained no complete JSON objects"}
|
||||
}
|
||||
|
||||
start := strings.Index(output, "[")
|
||||
start := strings.Index(repaired, "[")
|
||||
if start < 0 {
|
||||
return nil, []string{"LLM output contained no JSON array opening bracket"}
|
||||
}
|
||||
|
||||
candidate := output[start:idx+1] + "]"
|
||||
candidate := repaired[start:idx+1] + "]"
|
||||
if err := json.Unmarshal([]byte(candidate), &pages); err != nil {
|
||||
return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
|
||||
}
|
||||
@@ -51,6 +59,45 @@ func ParseRawPages(output string) ([]RawPage, []string) {
|
||||
return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
|
||||
}
|
||||
|
||||
// repairJSON replaces invalid JSON escape sequences (e.g. \. \d \p) with
|
||||
// a properly escaped backslash followed by the same character.
|
||||
// It iterates byte-by-byte to correctly skip already-valid escape sequences
|
||||
// (including \\) without requiring lookbehind support.
|
||||
func repairJSON(s string) string {
|
||||
var b strings.Builder
|
||||
b.Grow(len(s))
|
||||
i := 0
|
||||
for i < len(s) {
|
||||
if s[i] != '\\' {
|
||||
b.WriteByte(s[i])
|
||||
i++
|
||||
continue
|
||||
}
|
||||
// We have a backslash. Peek at the next character.
|
||||
if i+1 >= len(s) {
|
||||
// Trailing backslash — emit as-is.
|
||||
b.WriteByte(s[i])
|
||||
i++
|
||||
continue
|
||||
}
|
||||
next := s[i+1]
|
||||
switch next {
|
||||
case '"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u':
|
||||
// Valid JSON escape sequence — emit both characters as-is.
|
||||
b.WriteByte(s[i])
|
||||
b.WriteByte(next)
|
||||
i += 2
|
||||
default:
|
||||
// Invalid escape — double the backslash.
|
||||
b.WriteByte('\\')
|
||||
b.WriteByte('\\')
|
||||
b.WriteByte(next)
|
||||
i += 2
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func stripFences(s string) string {
|
||||
for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} {
|
||||
if strings.HasPrefix(s, prefix) {
|
||||
|
||||
@@ -59,3 +59,29 @@ func TestParseRawPages_MissingTitle(t *testing.T) {
|
||||
assert.Empty(t, warnings)
|
||||
assert.Empty(t, pages[0].Title)
|
||||
}
|
||||
|
||||
func TestParseRawPages_InvalidEscapeRepaired(t *testing.T) {
|
||||
// LLM copied markdown escaped list numbers (\.) into JSON — invalid escape
|
||||
raw := "[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"Step 4\\. Do it.\"}]"
|
||||
pages, warnings := ParseRawPages(raw)
|
||||
require.Len(t, pages, 1)
|
||||
assert.Equal(t, "Foo", pages[0].Title)
|
||||
assert.Contains(t, pages[0].Content, `4\.`)
|
||||
assert.NotEmpty(t, warnings) // repair warning
|
||||
}
|
||||
|
||||
func TestRepairJSON_FixesInvalidEscapes(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{`{"a":"foo\.bar"}`, `{"a":"foo\\.bar"}`},
|
||||
{`{"a":"\\n is fine"}`, `{"a":"\\n is fine"}`}, // valid \n untouched
|
||||
{`{"a":"\d+ items"}`, `{"a":"\\d+ items"}`},
|
||||
{`{"a":"already \\ escaped"}`, `{"a":"already \\ escaped"}`}, // valid \\ untouched
|
||||
}
|
||||
for _, tc := range cases {
|
||||
got := repairJSON(tc.in)
|
||||
assert.Equal(t, tc.want, got, "input: %s", tc.in)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,11 +59,31 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
||||
allWarnings = append(allWarnings, warnings...)
|
||||
}
|
||||
|
||||
pages, buildWarnings := BuildPages(allRaw, sourceSlug, date)
|
||||
allWarnings = append(allWarnings, buildWarnings...)
|
||||
return buildAndWrite(allRaw, sourceSlug, date, brainDir, source, inventory, allWarnings, dryRun)
|
||||
}
|
||||
|
||||
// RunRaw runs the pipeline on pre-parsed RawPages, skipping the LLM extraction
|
||||
// step. Use this when the caller has already produced the structured RawPage data
|
||||
// (e.g. from a more capable model or manual curation).
|
||||
func RunRaw(brainDir, source string, rawPages []RawPage, dryRun bool) (Result, error) {
|
||||
inventory, err := wiki.LoadInventory(brainDir)
|
||||
if err != nil {
|
||||
return Result{}, fmt.Errorf("load inventory: %w", err)
|
||||
}
|
||||
|
||||
sourceSlug := wiki.Slug(source)
|
||||
date := time.Now().UTC().Format("2006-01-02")
|
||||
|
||||
return buildAndWrite(rawPages, sourceSlug, date, brainDir, source, inventory, nil, dryRun)
|
||||
}
|
||||
|
||||
// buildAndWrite runs BuildPages through write for both Run and RunRaw.
|
||||
func buildAndWrite(rawPages []RawPage, sourceSlug, date, brainDir, source string, inventory map[wiki.PageType][]wiki.Entry, warnings []string, dryRun bool) (Result, error) {
|
||||
pages, buildWarnings := BuildPages(rawPages, sourceSlug, date)
|
||||
warnings = append(warnings, buildWarnings...)
|
||||
resolved := Resolve(pages, inventory)
|
||||
canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory)
|
||||
allWarnings = append(allWarnings, linkWarnings...)
|
||||
warnings = append(warnings, linkWarnings...)
|
||||
withRefs := injectSourceRefs(canonicalized, inventory, brainDir)
|
||||
merged := mergeAll(withRefs)
|
||||
|
||||
@@ -83,14 +103,14 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
|
||||
|
||||
if !dryRun {
|
||||
if err := wiki.RebuildIndex(brainDir, date); err != nil {
|
||||
allWarnings = append(allWarnings, fmt.Sprintf("rebuild index: %v", err))
|
||||
warnings = append(warnings, fmt.Sprintf("rebuild index: %v", err))
|
||||
}
|
||||
if err := wiki.AppendLog(brainDir, source, written, allWarnings, date); err != nil {
|
||||
allWarnings = append(allWarnings, fmt.Sprintf("append log: %v", err))
|
||||
if err := wiki.AppendLog(brainDir, source, written, warnings, date); err != nil {
|
||||
warnings = append(warnings, fmt.Sprintf("append log: %v", err))
|
||||
}
|
||||
}
|
||||
|
||||
return Result{Pages: written, Warnings: allWarnings}, nil
|
||||
return Result{Pages: written, Warnings: warnings}, nil
|
||||
}
|
||||
|
||||
// mergeAll deduplicates pages by path, merging content from later occurrences.
|
||||
|
||||
@@ -17,6 +17,8 @@ func (s *Skill) Handle(ctx context.Context, tool string, args json.RawMessage) (
|
||||
return s.query(ctx, args)
|
||||
case "brain_write":
|
||||
return s.write(ctx, args)
|
||||
case "brain_ingest_raw":
|
||||
return s.ingestRaw(ctx, args)
|
||||
case "brain_ingest":
|
||||
return s.ingest(ctx, args)
|
||||
case "brain_search":
|
||||
@@ -98,6 +100,33 @@ func (s *Skill) ingest(ctx context.Context, args json.RawMessage) (json.RawMessa
|
||||
return nil, fmt.Errorf("either content+source or path is required")
|
||||
}
|
||||
|
||||
type ingestRawArgs struct {
|
||||
Source string `json:"source"`
|
||||
Pages []any `json:"pages"`
|
||||
DryRun bool `json:"dry_run,omitempty"`
|
||||
}
|
||||
|
||||
func (s *Skill) ingestRaw(ctx context.Context, args json.RawMessage) (json.RawMessage, error) {
|
||||
var a ingestRawArgs
|
||||
if err := json.Unmarshal(args, &a); err != nil {
|
||||
return nil, fmt.Errorf("parse args: %w", err)
|
||||
}
|
||||
if s.cfg.IngestSvcURL == "" {
|
||||
return nil, fmt.Errorf("brain_ingest_raw: INGEST_SVC_URL not configured")
|
||||
}
|
||||
if a.Source == "" {
|
||||
return nil, fmt.Errorf("source is required")
|
||||
}
|
||||
if len(a.Pages) == 0 {
|
||||
return nil, fmt.Errorf("pages is required and must be non-empty")
|
||||
}
|
||||
return s.postTo(ctx, s.cfg.IngestSvcURL+"/ingest-raw", map[string]any{
|
||||
"source": a.Source,
|
||||
"pages": a.Pages,
|
||||
"dry_run": a.DryRun,
|
||||
})
|
||||
}
|
||||
|
||||
type searchArgs struct {
|
||||
Query string `json:"query"`
|
||||
Collection string `json:"collection,omitempty"`
|
||||
|
||||
@@ -55,6 +55,32 @@ func (s *Skill) Tools() []registry.ToolDef {
|
||||
},
|
||||
}
|
||||
if s.cfg.IngestSvcURL != "" {
|
||||
tools = append(tools, registry.ToolDef{
|
||||
Name: "brain_ingest_raw",
|
||||
Description: "Ingest pre-structured pages into the brain wiki, bypassing the LLM extraction step. " +
|
||||
"Use when you (the calling agent) have already extracted entities, concepts, and content from a source. " +
|
||||
"Provide source (human-readable name) and pages (array of {title, type, subtype, domain, content} objects). " +
|
||||
"The pipeline computes slugs, paths, frontmatter, wikilink canonicalization, and source back-references. " +
|
||||
"Returns the list of wiki pages written.",
|
||||
InputSchema: schema([]string{"source", "pages"}, map[string]any{
|
||||
"source": map[string]any{"type": "string", "description": "human-readable name for the source, e.g. 'shape-up-book'"},
|
||||
"pages": map[string]any{
|
||||
"type": "array",
|
||||
"items": map[string]any{
|
||||
"type": "object",
|
||||
"required": []string{"title", "type", "content"},
|
||||
"properties": map[string]any{
|
||||
"title": map[string]any{"type": "string", "description": "page title, e.g. 'Hash Encoding'"},
|
||||
"type": map[string]any{"type": "string", "enum": []string{"source", "concept", "entity"}, "description": "page type"},
|
||||
"subtype": map[string]any{"type": "string", "description": "entity: person|company|tool|model|framework|technology; source: article|pdf|book|video|note|project"},
|
||||
"domain": map[string]any{"type": "string", "description": "knowledge domain, e.g. 'Machine Learning'"},
|
||||
"content": map[string]any{"type": "string", "description": "markdown body — no frontmatter, use [[Display Name]] for wikilinks"},
|
||||
},
|
||||
},
|
||||
},
|
||||
"dry_run": map[string]any{"type": "boolean"},
|
||||
}),
|
||||
})
|
||||
tools = append(tools, registry.ToolDef{
|
||||
Name: "brain_ingest",
|
||||
Description: "Ingest content into the brain wiki (brain/wiki/). Calls an LLM to produce structured wiki pages. " +
|
||||
|
||||
Reference in New Issue
Block a user