Post #2074: Chunking Strategies for RAG Systems - How to Split Documents to Improve Retrieval Precision
Audience: engineers optimizing RAG system quality | Reading time: ~19 minutes | Core value: master the main document-chunking strategies, understand the best chunking scheme for each scenario, and improve RAG retrieval quality
A RAG system's quality comes down to three things: good chunking, a good embedding model, and a good retrieval strategy.
Many engineers pour their time into tuning model parameters while neglecting the chunking strategy, even though the way you chunk often affects retrieval quality more than swapping models does.
This article is devoted to document chunking: the pitfalls I have hit and the best practices I have found.
The Core Problems of Chunking
A good chunking scheme needs to satisfy the following (a quick sanity-check sketch follows the list):
- Semantic completeness: each chunk contains a complete semantic unit and never breaks mid-sentence
- Appropriate granularity: too large → low retrieval precision; too small → not enough context
- Key information preserved: important entities, numbers, and relations are not split apart
- Retrieval friendliness: a chunk's content should be able to match the user's query intent
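To keep these properties honest in practice, it helps to run a quick check over generated chunks. Below is a minimal sanity-check sketch; the ChunkQualityCheck class and its length bounds are illustrative assumptions, not part of any framework:

// Minimal sanity checks for the four criteria above (illustrative only).
public final class ChunkQualityCheck {

    // Granularity bounds are assumptions; tune them for your corpus.
    private static final int MIN_CHARS = 50;
    private static final int MAX_CHARS = 1500;

    /** Returns true if a chunk respects basic granularity and boundary rules. */
    public static boolean looksReasonable(String chunk) {
        String trimmed = chunk.trim();
        if (trimmed.length() < MIN_CHARS || trimmed.length() > MAX_CHARS) {
            return false; // too small → missing context; too large → diluted retrieval
        }
        // Semantic completeness: the chunk should end at a sentence boundary.
        char last = trimmed.charAt(trimmed.length() - 1);
        return "。!?;.!?;".indexOf(last) >= 0;
    }
}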
Basic Chunking Strategies
import java.util.ArrayList;
import java.util.List;

import org.springframework.stereotype.Service;
import lombok.extern.slf4j.Slf4j;

/**
 * Document chunker.
 * Implements several chunking strategies; choose by document type.
 */
@Service
@Slf4j
public class DocumentChunker {

    /**
     * Strategy 1: fixed-size chunking (simplest, lowest quality).
     * Only recommended for purely numeric/code content; works poorly on natural language.
     */
    public List<TextChunk> fixedSizeChunking(String text, int chunkSize, int overlap) {
        List<TextChunk> chunks = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
            int end = Math.min(start + chunkSize, text.length());
            chunks.add(new TextChunk(text.substring(start, end), start, end));
            if (end == text.length()) break; // done; stepping back by overlap here would loop forever
            start = end - overlap; // overlap region preserves continuity across chunks
        }
        return chunks;
    }

    /**
     * Strategy 2: sentence-level chunking (the baseline for natural-language text).
     * Splits on sentence boundaries to preserve semantic completeness.
     */
    public List<TextChunk> sentenceChunking(String text, int maxChunkSize) {
        // Chinese sentence boundaries: 。!?;
        String[] sentences = text.split("(?<=[。!?;])");
        List<TextChunk> chunks = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        int currentStart = 0;
        int posTracker = 0;
        for (String sentence : sentences) {
            if (current.length() + sentence.length() > maxChunkSize && current.length() > 0) {
                chunks.add(new TextChunk(current.toString().trim(),
                        currentStart, currentStart + current.length()));
                currentStart = posTracker;
                current = new StringBuilder();
            }
            current.append(sentence);
            posTracker += sentence.length();
        }
        if (current.length() > 0) {
            chunks.add(new TextChunk(current.toString().trim(),
                    currentStart, currentStart + current.length()));
        }
        return chunks;
    }

    /**
     * Strategy 3: paragraph-level chunking (recommended for most scenarios).
     * Splits on paragraphs; a paragraph is a natural semantic unit.
     */
    public List<TextChunk> paragraphChunking(String text, int maxChunkSize) {
        String[] paragraphs = text.split("\n\n+"); // blank lines (two or more consecutive newlines) separate paragraphs
        List<TextChunk> chunks = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        for (String paragraph : paragraphs) {
            paragraph = paragraph.trim();
            if (paragraph.isEmpty()) continue;
            // A single paragraph larger than the limit needs further splitting
            if (paragraph.length() > maxChunkSize) {
                if (current.length() > 0) {
                    chunks.add(new TextChunk(current.toString(), 0, 0)); // positions not tracked in this strategy
                    current = new StringBuilder();
                }
                // Fall back to sentence-level splitting for oversized paragraphs
                chunks.addAll(sentenceChunking(paragraph, maxChunkSize));
            } else if (current.length() + paragraph.length() > maxChunkSize) {
                chunks.add(new TextChunk(current.toString(), 0, 0));
                current = new StringBuilder(paragraph);
            } else {
                if (current.length() > 0) current.append("\n\n");
                current.append(paragraph);
            }
        }
        if (current.length() > 0) {
            chunks.add(new TextChunk(current.toString(), 0, 0));
        }
        return chunks;
    }

    public record TextChunk(String text, int startPos, int endPos) {}
}
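Before moving on to advanced strategies, here is a hypothetical usage sketch of DocumentChunker; the file path and size parameters are made-up examples, not recommendations from a framework:

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

public class ChunkerDemo {
    public static void main(String[] args) throws Exception {
        DocumentChunker chunker = new DocumentChunker();
        String article = Files.readString(Path.of("docs/article.txt")); // placeholder input

        // Paragraph chunking: the recommended default for prose.
        List<DocumentChunker.TextChunk> chunks = chunker.paragraphChunking(article, 800);
        chunks.forEach(c -> System.out.printf("%d chars: %s%n",
                c.text().length(), c.text().substring(0, Math.min(30, c.text().length()))));

        // Fixed-size chunking with ~10% overlap: reserve for code/number-heavy text.
        List<DocumentChunker.TextChunk> fixed = chunker.fixedSizeChunking(article, 500, 50);
        System.out.println("fixed-size chunks: " + fixed.size());
    }
}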
Advanced Chunking Strategies
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.springframework.stereotype.Service;
import lombok.Builder;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

/**
 * Structure-aware chunking.
 * Uses the document's structure (headings, lists, tables) to place chunk boundaries.
 * Considerably higher quality than plain-text chunking.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class StructureAwareChunker {

    /**
     * Markdown document chunking.
     * Splits by heading level so each section stays intact.
     */
    public List<DocumentChunk> chunkMarkdown(String markdown, int maxTokens) {
        List<DocumentChunk> chunks = new ArrayList<>();
        // Split on H1/H2 headings
        String[] sections = markdown.split("(?=\n#{1,2} )");
        for (String section : sections) {
            if (section.trim().isEmpty()) continue;
            // Extract the heading text
            String title = extractHeading(section);
            if (estimateTokens(section) <= maxTokens) {
                // The whole section fits in one chunk
                chunks.add(DocumentChunk.builder()
                        .content(section.trim())
                        .title(title)
                        .chunkType("section")
                        .build());
            } else {
                // Section too large: split further on sub-headings
                String[] subsections = section.split("(?=\n#{3,4} )");
                for (String subsection : subsections) {
                    if (!subsection.trim().isEmpty()) {
                        chunks.add(DocumentChunk.builder()
                                .content(subsection.trim())
                                .title(title + " - " + extractHeading(subsection))
                                .chunkType("subsection")
                                .build());
                    }
                }
            }
        }
        return chunks;
    }

    /**
     * Code-document chunking.
     * Handles code blocks and explanatory text separately.
     */
    public List<DocumentChunk> chunkCodeDocument(String content) {
        List<DocumentChunk> chunks = new ArrayList<>();
        // Extract fenced code blocks
        Pattern codePattern = Pattern.compile("```(\\w+)?\\n([\\s\\S]+?)```");
        Matcher matcher = codePattern.matcher(content);
        int lastEnd = 0;
        while (matcher.find()) {
            // Text preceding the code block
            String textBefore = content.substring(lastEnd, matcher.start()).trim();
            if (!textBefore.isEmpty()) {
                chunks.add(DocumentChunk.builder()
                        .content(textBefore)
                        .chunkType("text")
                        .build());
            }
            // The code block itself
            String language = matcher.group(1) != null ? matcher.group(1) : "code";
            String code = matcher.group(2);
            chunks.add(DocumentChunk.builder()
                    .content(code.trim())
                    .chunkType("code")
                    .metadata(Map.of("language", language))
                    .build());
            lastEnd = matcher.end();
        }
        // Remaining text after the last code block
        String remaining = content.substring(lastEnd).trim();
        if (!remaining.isEmpty()) {
            chunks.add(DocumentChunk.builder()
                    .content(remaining)
                    .chunkType("text")
                    .build());
        }
        return chunks;
    }

    /**
     * Table chunking.
     * Copies the header row into every chunk so no row loses its column semantics.
     */
    public List<DocumentChunk> chunkTable(String tableContent) {
        String[] lines = tableContent.split("\n");
        // Not a real data table unless we have a header, a separator, and at least one row
        if (lines.length < 3) return List.of(DocumentChunk.builder()
                .content(tableContent).chunkType("table").build());
        String header = lines[0];    // header row
        String separator = lines[1]; // markdown separator row, e.g. |---|---|
        List<DocumentChunk> chunks = new ArrayList<>();
        // Group a few rows per chunk, each carrying the header
        int rowsPerChunk = 5;
        for (int i = 2; i < lines.length; i += rowsPerChunk) { // i=2 skips the separator row
            int end = Math.min(i + rowsPerChunk, lines.length);
            StringBuilder chunk = new StringBuilder();
            chunk.append(header).append("\n");
            chunk.append(separator).append("\n"); // reuse the real separator so the column count matches
            for (int j = i; j < end; j++) {
                chunk.append(lines[j]).append("\n");
            }
            chunks.add(DocumentChunk.builder()
                    .content(chunk.toString().trim())
                    .chunkType("table_rows")
                    .metadata(Map.of("tableHeader", header))
                    .build());
        }
        return chunks;
    }

    private String extractHeading(String text) {
        Pattern headingPattern = Pattern.compile("^#{1,6} (.+)$", Pattern.MULTILINE);
        Matcher m = headingPattern.matcher(text);
        return m.find() ? m.group(1) : "";
    }

    private int estimateTokens(String text) {
        // Rough estimate: Chinese ≈ 1.5 chars/token, English ≈ 4 chars/token
        return text.length() / 2;
    }

    @Data
    @Builder
    public static class DocumentChunk {
        private String content;
        private String title;
        private String chunkType; // text/code/table/section/subsection
        private Map<String, String> metadata;
    }
}
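A quick usage sketch for the structure-aware chunker; the manual.md path and the 512-token budget are assumptions for illustration:

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

public class StructureChunkDemo {
    public static void main(String[] args) throws Exception {
        StructureAwareChunker chunker = new StructureAwareChunker();
        String markdown = Files.readString(Path.of("docs/manual.md")); // placeholder input

        // Sections within ~512 tokens stay whole; larger ones split on H3/H4 headings.
        List<StructureAwareChunker.DocumentChunk> chunks = chunker.chunkMarkdown(markdown, 512);
        for (StructureAwareChunker.DocumentChunk c : chunks) {
            System.out.printf("[%s] %s (%d chars)%n",
                    c.getChunkType(), c.getTitle(), c.getContent().length());
        }
    }
}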
Parent-Child Chunking
This is one of the most effective techniques for improving RAG retrieval precision:
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import dev.langchain4j.data.document.Metadata;
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.store.embedding.EmbeddingMatch;
import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingStore;
import org.springframework.stereotype.Service;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

/**
 * Parent-child chunking strategy.
 *
 * The idea:
 * - small chunks are used for retrieval (they match queries more precisely)
 * - large chunks are used for generation (they give the LLM more context)
 *
 * The effect:
 * - retrieval precision close to that of small chunks
 * - generation quality close to that of large chunks
 * The best of both worlds.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class ParentChildChunkingService {

    private final EmbeddingModel embeddingModel;
    private final EmbeddingStore<TextSegment> vectorStore;
    private final Map<String, String> parentChunkStore; // parentId → full parent chunk content

    /**
     * Builds the parent-child index for a document.
     */
    public void indexDocument(String documentId, String documentContent) {
        // 1. Coarse split into parent chunks (~1000 chars)
        List<String> parentChunks = splitIntoParentChunks(documentContent, 1000);
        for (int i = 0; i < parentChunks.size(); i++) {
            String parentId = documentId + "_parent_" + i;
            String parentContent = parentChunks.get(i);
            // Store the full parent chunk
            parentChunkStore.put(parentId, parentContent);
            // 2. Split each parent chunk into child chunks (~200 chars)
            List<String> childChunks = splitIntoChildChunks(parentContent, 200);
            for (int j = 0; j < childChunks.size(); j++) {
                String childId = parentId + "_child_" + j;
                // Index the child chunk in the vector store.
                // Key point: the metadata carries the parent chunk's ID.
                TextSegment segment = TextSegment.from(
                        childChunks.get(j),
                        Metadata.from(Map.of(
                                "parentId", parentId,
                                "childId", childId,
                                "documentId", documentId,
                                "chunkIndex", String.valueOf(j)
                        ))
                );
                Embedding embedding = embeddingModel.embed(segment.text()).content();
                vectorStore.add(embedding, segment);
            }
        }
        log.info("Document indexed: {}, {} parent chunks", documentId, parentChunks.size());
    }

    /**
     * Parent-child retrieval:
     * 1. retrieve small chunks (precise matching)
     * 2. map each hit back to its parent chunk (richer context)
     * 3. return the parent chunk content to the LLM
     */
    public List<RetrievedContext> retrieve(String query, int topK) {
        // 1. Vector search over child chunks
        Embedding queryEmbedding = embeddingModel.embed(query).content();
        EmbeddingSearchRequest searchRequest = EmbeddingSearchRequest.builder()
                .queryEmbedding(queryEmbedding)
                .maxResults(topK * 2) // over-fetch: several hits may map to the same parent
                .minScore(0.7)
                .build();
        List<EmbeddingMatch<TextSegment>> matches =
                vectorStore.search(searchRequest).matches();
        // 2. Map child chunks to parent chunks, de-duplicating parents
        Set<String> usedParentIds = new LinkedHashSet<>();
        List<RetrievedContext> contexts = new ArrayList<>();
        for (EmbeddingMatch<TextSegment> match : matches) {
            String parentId = match.embedded().metadata().getString("parentId");
            if (parentId != null && !usedParentIds.contains(parentId)) {
                usedParentIds.add(parentId);
                // Fetch the full parent chunk content
                String parentContent = parentChunkStore.get(parentId);
                if (parentContent != null) {
                    contexts.add(new RetrievedContext(
                            parentContent,           // the full parent chunk for the LLM
                            match.embedded().text(), // the matched child chunk (for debugging)
                            match.score(),
                            parentId
                    ));
                }
            }
            if (contexts.size() >= topK) break;
        }
        return contexts;
    }

    private List<String> splitIntoParentChunks(String text, int targetSize) {
        // Split into parent chunks along paragraph boundaries
        String[] paragraphs = text.split("\n\n+");
        List<String> chunks = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        for (String para : paragraphs) {
            if (current.length() + para.length() > targetSize && current.length() > 0) {
                chunks.add(current.toString().trim());
                current = new StringBuilder();
            }
            current.append(para).append("\n\n");
        }
        if (current.length() > 0) chunks.add(current.toString().trim());
        return chunks;
    }

    private List<String> splitIntoChildChunks(String parentChunk, int targetSize) {
        // Split into child chunks along sentence boundaries
        String[] sentences = parentChunk.split("(?<=[。!?;])");
        List<String> chunks = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        for (String sentence : sentences) {
            if (current.length() + sentence.length() > targetSize && current.length() > 0) {
                chunks.add(current.toString().trim());
                current = new StringBuilder();
            }
            current.append(sentence);
        }
        if (current.length() > 0) chunks.add(current.toString().trim());
        return chunks;
    }

    public record RetrievedContext(
            String parentContent, // the full parent chunk
            String matchedChild,  // the child chunk that matched
            double score,
            String parentId
    ) {}
}
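A minimal wiring sketch for the service above, assuming langchain4j's InMemoryEmbeddingStore; the ParentChildDemo class, document ID, and sample query are invented for illustration:

import java.util.HashMap;

import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;

public class ParentChildDemo {
    // Accepts any langchain4j EmbeddingModel implementation (e.g. a local MiniLM model).
    public static void run(EmbeddingModel embeddingModel, String documentText) {
        ParentChildChunkingService service = new ParentChildChunkingService(
                embeddingModel,
                new InMemoryEmbeddingStore<TextSegment>(), // swap for pgvector/Milvus in production
                new HashMap<>());                          // parent chunks kept in memory here

        service.indexDocument("doc-001", documentText);

        service.retrieve("how to configure the connection pool", 3)
               .forEach(ctx -> System.out.printf("score=%.3f parent=%s%n",
                       ctx.score(), ctx.parentId()));
    }
}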
Semantic Chunking (The Most Advanced Approach)
import java.util.ArrayList;
import java.util.List;

import dev.langchain4j.model.embedding.EmbeddingModel;
import org.springframework.stereotype.Service;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

/**
 * Chunking based on semantic similarity.
 * Cuts wherever two adjacent sentences diverge semantically.
 * Best quality, but also the most compute-intensive.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class SemanticChunker {

    private final EmbeddingModel embeddingModel;

    /**
     * Semantics-aware chunking.
     * @param breakpointThreshold cosine-distance threshold; split where the distance exceeds it (0.1-0.3)
     */
    public List<String> semanticChunking(String text, double breakpointThreshold) {
        // 1. Split into sentences
        String[] sentences = text.split("(?<=[。!?])");
        if (sentences.length <= 1) return List.of(text);
        // 2. Embed each sentence
        List<float[]> embeddings = new ArrayList<>();
        for (String sentence : sentences) {
            embeddings.add(embeddingModel.embed(sentence.trim()).content().vector());
        }
        // 3. Find semantic breakpoints (positions where adjacent sentences are dissimilar)
        List<Integer> breakpoints = new ArrayList<>();
        for (int i = 0; i < embeddings.size() - 1; i++) {
            double similarity = cosineSimilarity(embeddings.get(i), embeddings.get(i + 1));
            double distance = 1.0 - similarity;
            if (distance > breakpointThreshold) {
                breakpoints.add(i);
                log.debug("Semantic breakpoint between sentences {} and {}, distance={}",
                        i, i + 1, String.format("%.3f", distance));
            }
        }
        // 4. Cut at the breakpoints
        List<String> chunks = new ArrayList<>();
        int start = 0;
        for (int breakpoint : breakpoints) {
            StringBuilder chunk = new StringBuilder();
            for (int i = start; i <= breakpoint; i++) {
                chunk.append(sentences[i]).append(" ");
            }
            chunks.add(chunk.toString().trim());
            start = breakpoint + 1;
        }
        // The trailing segment
        StringBuilder lastChunk = new StringBuilder();
        for (int i = start; i < sentences.length; i++) {
            lastChunk.append(sentences[i]).append(" ");
        }
        if (lastChunk.length() > 0) chunks.add(lastChunk.toString().trim());
        log.info("Semantic chunking done: {} sentences → {} chunks", sentences.length, chunks.size());
        return chunks;
    }

    private double cosineSimilarity(float[] a, float[] b) {
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }
}
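A small sketch showing how the threshold drives chunk counts; SemanticChunkDemo and the candidate thresholds are illustrative and should be tuned on your own corpus:

import java.util.List;

import dev.langchain4j.model.embedding.EmbeddingModel;

public class SemanticChunkDemo {
    public static void run(EmbeddingModel embeddingModel, String text) {
        SemanticChunker chunker = new SemanticChunker(embeddingModel);

        // Lower threshold → more breakpoints → smaller chunks; start near 0.2 and tune.
        for (double threshold : new double[]{0.1, 0.2, 0.3}) {
            List<String> chunks = chunker.semanticChunking(text, threshold);
            System.out.printf("threshold=%.1f -> %d chunks%n", threshold, chunks.size());
        }
    }
}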
Chunking Strategy Selection Guide
| Document type | Recommended strategy | Parameter reference |
|---|---|---|
| Regular articles/blogs | Paragraph chunking + parent-child chunking | parent = 1000 chars, child = 200 chars |
| Technical docs/manuals | Structure-aware chunking (by heading) | section integrity first |
| Legal contracts | Clause-level chunking (by clause number) | one clause per chunk |
| Code documentation | Code and prose chunked separately | per function/class |
| News/reports | Semantic chunking | threshold 0.2-0.3 |
| Tabular data | Row-level chunking with header | 5-10 rows per chunk |
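One way to make this table operational is a small router that dispatches to the chunkers above by document type. The sketch below is an assumption-laden illustration: the DocType enum is hypothetical, and clause-level chunking for contracts is not implemented in this article, so that case is omitted:

import java.util.List;

public class ChunkingStrategyRouter {
    public enum DocType { ARTICLE, TECH_DOC, CODE_DOC, NEWS, TABLE }

    private final DocumentChunker basic = new DocumentChunker();
    private final StructureAwareChunker structural = new StructureAwareChunker();

    /** Dispatches to the strategy recommended in the table above. */
    public List<?> chunk(DocType type, String content) {
        return switch (type) {
            case ARTICLE  -> basic.paragraphChunking(content, 1000);  // then index as parent-child
            case TECH_DOC -> structural.chunkMarkdown(content, 512);
            case CODE_DOC -> structural.chunkCodeDocument(content);
            case TABLE    -> structural.chunkTable(content);
            case NEWS     -> basic.sentenceChunking(content, 500);    // or SemanticChunker, threshold 0.2-0.3
        };
    }
}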
Chunking is the foundation of RAG, but there is no one-size-fits-all strategy. The best approach: design the chunking scheme around your specific document types, then measure recall with RAGAS or a similar evaluation framework and optimize against the metrics.
