From 537aebc3028a3dae40f9c724bf01545ac7819ed7 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 19:45:21 +0200 Subject: [PATCH] feat(pipeline): update system prompt for new LLM JSON contract (no slugs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change prompt to reflect new output format: title, type, subtype, domain, content - Remove slug/path generation responsibility from LLM — pipeline now handles it - Wikilinks change from [[slug|Display Name]] to [[Display Name]] only - LLM no longer includes frontmatter or paths in output docs(schema): update LLM output format and wikilink convention for Level 3 - Specify JSON schema: title, type, subtype, domain, content fields - Remove frontmatter requirements from schema output (handled by pipeline) - Simplify wikilink format to [[Display Name]] — no slug or pipe - Pipeline now responsible for slug generation and frontmatter construction These changes shift slug/frontmatter generation from LLM to pipeline, reducing cognitive load on the model and improving control over output. Co-Authored-By: Claude Sonnet 4.6 --- brain/schema.md | 89 +++++++++++---------------- ingestion/internal/pipeline/prompt.go | 27 ++++---- 2 files changed, 51 insertions(+), 65 deletions(-) diff --git a/brain/schema.md b/brain/schema.md index f63a8ef..7dbad14 100644 --- a/brain/schema.md +++ b/brain/schema.md @@ -3,21 +3,34 @@ This document defines the three page types in the brain wiki. The LLM must follow this schema exactly when generating wiki pages. +## Output Format + +Return a JSON array. Each element: + +```json +{ + "title": "exact page title", + "type": "source | concept | entity", + "subtype": "see below — omit for concept", + "domain": "see domains — omit if none fits", + "content": "Markdown body only — no frontmatter, no path" +} +``` + +- `subtype` for **source**: `article | pdf | book | video | note | project` +- `subtype` for **entity**: `person | company | tool | model | framework | technology` +- The pipeline computes slugs and frontmatter — never include them in output. + ## Wikilink Format -All cross-references use `[[slug|Display Text]]`. +All cross-references use `[[Display Name]]` — just the display name, no slug, no pipe. Rules: -- slug = lowercase filename without .md, spaces → hyphens, strip all non-alphanumeric except hyphens -- The `|` separator is REQUIRED — never use `[[Title]]` without a slug -- Examples: `[[domain-driven-design|Domain Driven Design]]`, `[[ryan-singer|Ryan Singer]]` -- Slugs must resolve to an existing file in the inventory, or a file you are creating in this response +- Only link to pages in the inventory or pages you are creating in this response +- The pipeline converts `[[Display Name]]` to `[[slug|Display Name]]` automatically +- Section links must match their section type (Related Concepts → concept pages only, etc.) -Slug generation examples: -- "Domain Driven Design" → `domain-driven-design` -- "It's Complicated" → `its-complicated` -- "gRPC" → `grpc` -- "GPT-4o" → `gpt-4o` +Examples: `[[Domain Driven Design]]`, `[[Ryan Singer]]`, `[[Shape Up]]` ## Domains @@ -30,17 +43,6 @@ Use one of: `ai-llm`, `software-engineering`, `product-strategy`, `finance-marke One page per ingested source. Books are NEVER split across multiple source pages — update the existing one. -Required frontmatter: -```yaml -title: -type: article | pdf | book | video | note | project -domain: -date_ingested: YYYY-MM-DD -last_updated: YYYY-MM-DD -aliases: - - -``` - Body sections (in this order): ### Summary @@ -50,10 +52,10 @@ Body sections (in this order): Bulleted list. Paraphrase — no verbatim quotes or code. ### Concepts Introduced or Reinforced -Wikilinks to wiki/concepts/ ONLY. One per line. +Wikilinks to concept pages ONLY. One per line. ### Entities Mentioned -Wikilinks to wiki/entities/ ONLY. One per line. +Wikilinks to entity pages ONLY. One per line. ### Open Questions Raised Gaps or follow-up questions from this source. @@ -75,15 +77,6 @@ Dated entries appended on re-ingestion. NEVER rewrite — only append. One page per idea, framework, methodology, or pattern. -Required frontmatter: -```yaml -title: -domain: -last_updated: YYYY-MM-DD -aliases: - - -``` - Body sections (in this order): ### Definition @@ -93,13 +86,13 @@ One-paragraph plain-language explanation. Practical significance. Why should anyone care? ### Related Concepts -Wikilinks to wiki/concepts/ ONLY. +Wikilinks to concept pages ONLY. ### Related Entities -Wikilinks to wiki/entities/ ONLY. +Wikilinks to entity pages ONLY. ### Sources -Wikilinks to wiki/sources/ ONLY. +Wikilinks to source pages ONLY. ### Evolving Notes Updated as new sources arrive. Append, do not rewrite. @@ -110,16 +103,6 @@ Updated as new sources arrive. Append, do not rewrite. One page per person, tool, organisation, technology, or product. -Required frontmatter: -```yaml -title: -type: person | company | tool | model | framework | technology -domain: -last_updated: YYYY-MM-DD -aliases: - - -``` - Body sections (in this order): ### Description @@ -132,23 +115,23 @@ Why this entity matters to this knowledge base. With dates where known. ### Related Concepts -Wikilinks to wiki/concepts/ ONLY. +Wikilinks to concept pages ONLY. ### Related Entities -Wikilinks to wiki/entities/ ONLY. +Wikilinks to entity pages ONLY. ### Sources -Wikilinks to wiki/sources/ ONLY. +Wikilinks to source pages ONLY. --- ## Non-Negotiable Rules 1. Output ONLY a valid JSON array — no markdown fences, no prose before or after -2. Each element: `{"path": "wiki//.md", "content": "...full markdown..."}` -3. Slugs are kebab-case: lowercase, spaces→hyphens, strip special characters -4. Every wikilink must be `[[slug|Display Text]]` — the pipe separator is required -5. Dates always YYYY-MM-DD +2. Each element: `{"title": "...", "type": "...", "subtype": "...", "domain": "...", "content": "..."}` +3. Never include slugs, paths, or frontmatter in output — the pipeline handles these +4. Wikilinks: `[[Display Name]]` only — no pipe, no slug +5. Dates always YYYY-MM-DD (used only in content body where contextually relevant) 6. Never reproduce verbatim code — describe the pattern or technique -7. Section links must match their section type (Related Concepts → concepts/ only, etc.) +7. Section links must match their section type 8. One source page per book — if inventory shows it exists, include it as an UPDATE diff --git a/ingestion/internal/pipeline/prompt.go b/ingestion/internal/pipeline/prompt.go index 4ea6c65..00b65c7 100644 --- a/ingestion/internal/pipeline/prompt.go +++ b/ingestion/internal/pipeline/prompt.go @@ -12,12 +12,15 @@ import ( const systemPrompt = `You are a wiki agent. Read the source material and produce structured wiki pages following the schema provided. Output ONLY a valid JSON array — no markdown fences, no other text before or after. -Each element must have: - "path" — relative path within the wiki, e.g. "wiki/sources/foo.md" - "content" — full markdown content of the page including YAML frontmatter +Each element must have exactly these fields: + "title" — exact page title (e.g. "FinBERT", "Ryan Singer", "Shape Up") + "type" — exactly one of: "source", "concept", "entity" + "subtype" — for source: article|pdf|book|video|note|project; for entity: person|company|tool|model|framework|technology; omit for concept + "domain" — one of the domains in the schema (omit if none fits) + "content" — Markdown body only — NO frontmatter, NO path, NO slug -Follow the schema strictly: correct frontmatter fields, wikilinks as [[slug|Display Text]], -dates in YYYY-MM-DD format, and paraphrase rather than quoting verbatim.` +Wikilinks in content: [[Display Name]] — just the display name, no slug, no pipe separator. +Only link to pages listed in the inventory or pages you are creating in this response.` // BuildPrompt constructs the user prompt for a single chunk. func BuildPrompt(schema, source, content string, inventory map[wiki.PageType][]wiki.Entry) string { @@ -30,7 +33,7 @@ func BuildPrompt(schema, source, content string, inventory map[wiki.PageType][]w sb.WriteString("\n\n") sb.WriteString("## Existing wiki pages\n\n") - sb.WriteString("Link ONLY to pages in this inventory or pages you are creating in this response.\n\n") + sb.WriteString("Reference these pages by display name only — [[Display Name]] — in your content.\n\n") for _, pt := range []wiki.PageType{wiki.PageTypeConcept, wiki.PageTypeEntity, wiki.PageTypeSource} { entries := inventory[pt] @@ -39,19 +42,19 @@ func BuildPrompt(schema, source, content string, inventory map[wiki.PageType][]w fmt.Fprintf(&sb, "%s — (none yet)\n\n", label) continue } - fmt.Fprintf(&sb, "%s — link ONLY under the matching section:\n", label) + fmt.Fprintf(&sb, "%s:\n", label) for _, e := range entries { - fmt.Fprintf(&sb, " - [[%s|%s]]\n", e.Slug, e.Title) + fmt.Fprintf(&sb, " - %s\n", e.Title) } sb.WriteString("\n") } sb.WriteString("## Non-negotiable rules\n\n") sb.WriteString("1. Output ONLY a valid JSON array — no prose, no fences.\n") - sb.WriteString("2. Slugs are kebab-case: lowercase, spaces→hyphens, no special chars.\n") - sb.WriteString("3. Wikilinks: [[slug|Display Text]] — the pipe is required.\n") - sb.WriteString("4. Section links must match their section type.\n") - sb.WriteString("5. One source page per book — update it if inventory shows it exists.\n\n") + sb.WriteString("2. Fields: title, type, subtype (if applicable), domain (if applicable), content.\n") + sb.WriteString("3. Wikilinks: [[Display Name]] — no slug, no pipe. The pipeline handles slugs.\n") + sb.WriteString("4. Section links must match their section type (Related Concepts → concepts only, etc.).\n") + sb.WriteString("5. One source page per book — if inventory shows it exists, return it as an UPDATE.\n\n") fmt.Fprintf(&sb, "## Source: %s\n\n", source) sb.WriteString(content)