feat(ingestion): add content chunking and LLM JSON output parser
This commit is contained in:
39
ingestion/internal/pipeline/chunk.go
Normal file
39
ingestion/internal/pipeline/chunk.go
Normal file
@@ -0,0 +1,39 @@
|
||||
// ingestion/internal/pipeline/chunk.go
|
||||
package pipeline
|
||||
|
||||
import (
	"strings"
	"unicode/utf8"
)
|
||||
|
||||
// Chunk splits content into pieces of at most maxSize bytes, preferring to
// split at paragraph boundaries (\n\n).
//
// The content is trimmed of surrounding whitespace first. If maxSize <= 0, or
// the trimmed content already fits in maxSize bytes, it is returned as a single
// chunk (which may be the empty string).
//
// Otherwise each paragraph is trimmed, empty paragraphs are dropped, and
// consecutive paragraphs are packed into a chunk, joined by "\n\n", for as long
// as the chunk stays within maxSize. A paragraph that is itself longer than
// maxSize is hard-split on UTF-8 rune boundaries so that it no longer produces
// an oversized chunk; its pieces are emitted on their own and are not packed
// together with neighbouring paragraphs.
func Chunk(content string, maxSize int) []string {
	const sep = "\n\n"

	content = strings.TrimSpace(content)
	if maxSize <= 0 || len(content) <= maxSize {
		return []string{content}
	}

	var chunks []string
	var cur strings.Builder
	// flush emits the chunk being built, if any, and starts a new one.
	flush := func() {
		if cur.Len() > 0 {
			chunks = append(chunks, cur.String())
			cur.Reset()
		}
	}

	for _, para := range strings.Split(content, sep) {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}

		// A paragraph that cannot fit in any chunk has to be broken up,
		// otherwise the size bound promised to callers would be violated.
		if len(para) > maxSize {
			flush()
			chunks = splitOversized(chunks, para, maxSize)
			continue
		}

		if cur.Len() > 0 {
			if cur.Len()+len(sep)+len(para) <= maxSize {
				cur.WriteString(sep)
				cur.WriteString(para)
				continue
			}
			flush()
		}
		cur.WriteString(para)
	}
	flush()

	return chunks
}

// splitOversized appends s to chunks in pieces of at most maxSize bytes, cutting
// only at UTF-8 rune boundaries. maxSize must be positive. If a single rune is
// wider than maxSize it is emitted whole rather than being cut in half, and if
// s is not valid UTF-8 the cut falls back to a plain byte offset.
func splitOversized(chunks []string, s string, maxSize int) []string {
	for len(s) > maxSize {
		cut := maxSize
		// Back up over continuation bytes so a multi-byte rune is not cut.
		// Valid UTF-8 never needs more than UTFMax-1 steps.
		for cut > 0 && maxSize-cut < utf8.UTFMax-1 && !utf8.RuneStart(s[cut]) {
			cut--
		}
		switch {
		case cut == 0:
			// The leading rune alone is wider than maxSize; emit it whole
			// so the loop always makes progress.
			_, cut = utf8.DecodeRuneInString(s)
		case !utf8.RuneStart(s[cut]):
			// Not valid UTF-8 around the cut point; cut by bytes.
			cut = maxSize
		}
		chunks = append(chunks, s[:cut])
		s = s[cut:]
	}
	if s != "" {
		chunks = append(chunks, s)
	}
	return chunks
}
|
||||
Reference in New Issue
Block a user