第2051篇:RAG评估体系——用数据衡量你的知识库质量
大约 6 分钟
第2051篇:RAG评估体系——用数据衡量你的知识库质量
适读人群:需要量化RAG系统效果的工程师和产品负责人 | 阅读时长:约18分钟 | 核心价值:建立完整的RAG评估指标体系,用数据驱动RAG系统优化
"感觉RAG效果还行"——这句话我在项目中听到太多次了。
但"感觉"没办法推动改进,也没办法判断某次参数调整到底有没有效果。要做好RAG,必须有量化指标。
RAG评估的三个层次
检索质量指标
/**
 * Retrieval-quality evaluation.
 *
 * Computes the standard IR metrics for the retrieval stage of a RAG pipeline
 * against a human-labelled evaluation set: Recall@K, Precision@K and MRR.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class RetrievalQualityEvaluator {

    /**
     * Runs retrieval for every evaluation case and averages the metrics.
     *
     * Evaluation dataset: each record holds a query plus the human-annotated
     * relevant document ids. Cases without any labelled relevant document are
     * skipped — they cannot be scored, and counting them as 0 would deflate
     * every average (this is the fix over the previous behavior).
     *
     * @param evalCases      labelled (query, relevant doc ids) cases
     * @param embeddingModel model used to embed the query
     * @param vectorStore    vector store to search
     * @param k              retrieval cut-off
     * @return averaged Recall@K / Precision@K / MRR over the scoreable cases
     */
    public RetrievalMetrics evaluate(
            List<RetrievalEvalCase> evalCases,
            EmbeddingModel embeddingModel,
            EmbeddingStore<TextSegment> vectorStore,
            int k) {
        List<Double> recallAtK = new ArrayList<>();
        List<Double> precisionAtK = new ArrayList<>();
        List<Double> mrrs = new ArrayList<>();
        for (RetrievalEvalCase evalCase : evalCases) {
            Set<String> relevant = new HashSet<>(evalCase.relevantDocIds());
            // FIX: a case with no labelled relevant docs is unscoreable;
            // previously it contributed artificial zeros to recall and MRR.
            if (relevant.isEmpty()) {
                log.warn("跳过无相关文档标注的评估用例: {}", evalCase.query());
                continue;
            }
            // Execute retrieval.
            float[] queryEmb = embeddingModel.embed(evalCase.query());
            // NOTE(review): vectorSearch/vectorEmb are helpers defined outside
            // this excerpt — confirm they exist in the enclosing codebase.
            List<String> retrievedDocs = vectorSearch(vectorEmb(queryEmb), vectorStore, k);
            long relevantRetrieved = retrievedDocs.stream()
                    .filter(relevant::contains)
                    .count();
            // Recall@K: fraction of the labelled relevant docs that were retrieved.
            recallAtK.add((double) relevantRetrieved / relevant.size());
            // Precision@K: fraction of the retrieved docs that are relevant.
            precisionAtK.add(retrievedDocs.isEmpty() ? 0
                    : (double) relevantRetrieved / retrievedDocs.size());
            // Reciprocal rank: 1/position of the first relevant hit, 0 if none.
            double rr = 0;
            for (int i = 0; i < retrievedDocs.size(); i++) {
                if (relevant.contains(retrievedDocs.get(i))) {
                    rr = 1.0 / (i + 1);
                    break;
                }
            }
            mrrs.add(rr);
        }
        return new RetrievalMetrics(
                average(recallAtK),
                average(precisionAtK),
                average(mrrs),
                k
        );
    }

    /** Arithmetic mean; 0 for an empty list. */
    private double average(List<Double> values) {
        return values.stream().mapToDouble(Double::doubleValue).average().orElse(0);
    }

    /** Aggregated retrieval metrics for a fixed cut-off K. */
    @Data @AllArgsConstructor
    public static class RetrievalMetrics {
        private double recallAtK;    // higher is better (target: > 0.8)
        private double precisionAtK; // higher is better (target: > 0.7)
        private double mrr;          // higher is better (target: > 0.7)
        private int k;

        @Override
        public String toString() {
            return String.format("Recall@%d=%.3f, Precision@%d=%.3f, MRR=%.3f",
                    k, recallAtK, k, precisionAtK, mrr);
        }
    }

    /** One labelled evaluation case: a query plus its known-relevant doc ids. */
    public record RetrievalEvalCase(
            String query,
            List<String> relevantDocIds
    ) {}
}
生成质量指标:RAGAS框架
RAGAS是目前最流行的RAG评估框架,用LLM自动评估生成质量:
/**
 * RAGAS-style generation-quality evaluation (Java implementation).
 *
 * Core RAGAS metrics: faithfulness, answer relevancy, context precision.
 * (Context recall is mentioned in the original notes but not implemented
 * in this class.)
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class RagasStyleEvaluator {

    private final ChatLanguageModel evaluatorLlm; // judge with a strong model (e.g. GPT-4o)
    private final EmbeddingModel embeddingModel;

    /**
     * Faithfulness: is every claim made in the answer supported by the
     * retrieved documents? Range 0-1, higher is better.
     *
     * @return fraction of atomic statements supported by the contexts;
     *         0 if no statements could be extracted
     */
    public double evaluateFaithfulness(
            String question,
            String answer,
            List<String> contexts) {
        // Step 1: decompose the answer into atomic statements.
        String statementsPrompt = String.format("""
                将以下回答分解为独立的原子声明,每行一个声明:
                问题:%s
                回答:%s
                声明(每行一个):
                """, question, answer);
        String statementsText = evaluatorLlm.generate(statementsPrompt);
        List<String> statements = Arrays.stream(statementsText.split("\n"))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .collect(Collectors.toList());
        if (statements.isEmpty()) return 0;

        // Step 2: verify each statement against the concatenated contexts.
        String contextText = String.join("\n\n", contexts);
        int supportedCount = 0;
        for (String statement : statements) {
            String verifyPrompt = String.format("""
                    基于以下文档内容,判断声明是否有文档支撑。
                    只输出 YES 或 NO。
                    文档内容:%s
                    声明:%s
                    判断(YES/NO):
                    """, contextText, statement);
            if (isYes(evaluatorLlm.generate(verifyPrompt))) {
                supportedCount++;
            }
        }
        return (double) supportedCount / statements.size();
    }

    /**
     * Answer relevancy: does the answer actually address the user's question?
     * Range 0-1, higher is better.
     *
     * Reverse-generates candidate questions from the answer and measures their
     * embedding similarity to the original question.
     */
    public double evaluateAnswerRelevancy(String question, String answer) {
        String reversePrompt = String.format("""
                根据以下回答,生成3个可能触发该回答的问题,每行一个:
                回答:%s
                问题(每行一个):
                """, answer);
        String generatedQuestionsText = evaluatorLlm.generate(reversePrompt);
        List<String> generatedQuestions = Arrays.stream(generatedQuestionsText.split("\n"))
                .map(String::trim)
                .filter(s -> !s.isEmpty())
                .limit(3)
                .collect(Collectors.toList());
        if (generatedQuestions.isEmpty()) return 0;

        // Average cosine similarity between the original question and each
        // reverse-generated question.
        float[] originalEmb = embeddingModel.embed(question);
        return generatedQuestions.stream()
                .mapToDouble(q -> cosineSimilarity(originalEmb, embeddingModel.embed(q)))
                .average()
                .orElse(0);
    }

    /**
     * Context precision: how many of the retrieved contexts are actually
     * useful for answering the question? Range 0-1, higher is better.
     */
    public double evaluateContextPrecision(
            String question,
            String groundTruth,
            List<String> retrievedContexts) {
        int usefulCount = 0;
        for (String context : retrievedContexts) {
            String verifyPrompt = String.format("""
                    判断以下上下文对于回答问题是否有用,只输出 YES 或 NO。
                    问题:%s
                    参考答案:%s
                    上下文:%s
                    是否有用(YES/NO):
                    """, question, groundTruth, context);
            if (isYes(evaluatorLlm.generate(verifyPrompt))) {
                usefulCount++;
            }
        }
        return retrievedContexts.isEmpty() ? 0
                : (double) usefulCount / retrievedContexts.size();
    }

    /**
     * Aggregate report: unweighted mean of the three metrics.
     */
    public RagasReport evaluate(
            String question,
            String answer,
            List<String> contexts,
            String groundTruth) {
        double faithfulness = evaluateFaithfulness(question, answer, contexts);
        double answerRelevancy = evaluateAnswerRelevancy(question, answer);
        double contextPrecision = evaluateContextPrecision(question, groundTruth, contexts);
        double overallScore = (faithfulness + answerRelevancy + contextPrecision) / 3;
        return new RagasReport(faithfulness, answerRelevancy, contextPrecision, overallScore);
    }

    /**
     * Shared YES/NO verdict parsing (deduplicated from the two evaluators).
     * FIX: uses Locale.ROOT so upper-casing is stable regardless of the
     * JVM's default locale.
     */
    private boolean isYes(String judgment) {
        return judgment.trim().toUpperCase(java.util.Locale.ROOT).startsWith("YES");
    }

    /**
     * Cosine similarity of two equal-length vectors.
     * FIX: returns 0 for a zero-norm vector instead of NaN (0/0 division).
     */
    private double cosineSimilarity(float[] a, float[] b) {
        double dot = 0, na = 0, nb = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            na += a[i] * a[i];
            nb += b[i] * b[i];
        }
        double denom = Math.sqrt(na) * Math.sqrt(nb);
        return denom == 0 ? 0 : dot / denom;
    }

    /** Per-sample RAGAS scores. */
    @Data @AllArgsConstructor
    public static class RagasReport {
        private double faithfulness;     // claim support by the contexts
        private double answerRelevancy;  // answer addresses the question
        private double contextPrecision; // retrieved contexts are useful
        private double overallScore;     // unweighted mean of the three
    }
}
评估数据集的构建
有了评估指标,还需要评估数据集。推荐的方式:
/**
 * Automatic evaluation-dataset generation.
 *
 * Uses an LLM to produce question/answer pairs from document segments, so
 * that a ground-truth evaluation set exists without manual annotation.
 */
@Service
@RequiredArgsConstructor
public class EvalDatasetGenerator {

    private final ChatLanguageModel llm;

    /**
     * Generates Q/A pairs for every document segment.
     *
     * @param segments        source document segments
     * @param pairsPerSegment how many pairs the LLM is asked to produce per segment
     * @return all parsed pairs, each tagged with its source segment text
     */
    public List<QAPair> generateFromDocuments(List<TextSegment> segments, int pairsPerSegment) {
        List<QAPair> dataset = new ArrayList<>();
        for (TextSegment seg : segments) {
            String prompt = String.format("""
                    基于以下文档内容,生成%d个问答对。
                    要求:问题应该可以从文档中找到答案,问题类型多样(事实性、解释性、比较性)。
                    文档内容:
                    %s
                    输出格式(每对之间空行):
                    问:[问题]
                    答:[答案]
                    """, pairsPerSegment, seg.text());
            dataset.addAll(parseQAPairs(llm.generate(prompt), seg));
        }
        return dataset;
    }

    /**
     * Parses "问:/答:" (or "Q:/A:") lines out of an LLM response.
     * A pair is emitted only when an answer line follows a question line;
     * all four recognized prefixes are exactly two characters long.
     */
    private List<QAPair> parseQAPairs(String response, TextSegment sourceSegment) {
        List<QAPair> result = new ArrayList<>();
        String pendingQuestion = null;
        for (String raw : response.split("\n")) {
            String line = raw.trim();
            boolean questionLine = line.startsWith("问:") || line.startsWith("Q:");
            boolean answerLine = line.startsWith("答:") || line.startsWith("A:");
            if (questionLine) {
                pendingQuestion = line.substring(2).trim();
            } else if (answerLine && pendingQuestion != null) {
                result.add(new QAPair(pendingQuestion, line.substring(2).trim(), sourceSegment.text()));
                pendingQuestion = null;
            }
        }
        return result;
    }

    /** One evaluation sample: question, reference answer, and its source text. */
    public record QAPair(String question, String groundTruth, String sourceDocument) {}
}
建立评估流水线
/**
 * Complete RAG evaluation pipeline.
 *
 * For every Q/A pair: run retrieval, generate an answer, score it with the
 * RAGAS-style evaluator, then average the per-sample scores into a
 * system-level report.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class RagEvaluationPipeline {

    private final RetrievalQualityEvaluator retrievalEvaluator;
    private final RagasStyleEvaluator generationEvaluator;
    private final EmbeddingModel embeddingModel;
    private final EmbeddingStore<TextSegment> vectorStore;
    private final ChatLanguageModel chatModel;

    /**
     * Runs the full evaluation over the dataset.
     *
     * Failed cases are logged and excluded from the averages; the final
     * summary log makes the success/failure split explicit so a partially
     * failed run is not mistaken for a complete one (previously failures
     * were silently absorbed into the averages' denominator).
     *
     * @param evalDataset generated or hand-built Q/A pairs
     * @return averaged metrics; evalSetSize is the full input size
     */
    public RagSystemReport runFullEvaluation(List<EvalDatasetGenerator.QAPair> evalDataset) {
        log.info("开始RAG系统全面评估,测试集大小: {}", evalDataset.size());
        List<Double> faithfulnessScores = new ArrayList<>();
        List<Double> answerRelevancyScores = new ArrayList<>();
        List<Double> contextPrecisionScores = new ArrayList<>();
        int processedCount = 0;
        int failedCount = 0;
        for (EvalDatasetGenerator.QAPair pair : evalDataset) {
            try {
                // Retrieval: top-5 segments above the similarity threshold.
                float[] queryEmb = embeddingModel.embed(pair.question());
                EmbeddingSearchRequest searchReq = EmbeddingSearchRequest.builder()
                        .queryEmbedding(Embedding.from(queryEmb))
                        .maxResults(5)
                        .minScore(0.6)
                        .build();
                List<String> contexts = vectorStore.search(searchReq).matches().stream()
                        .map(m -> m.embedded().text())
                        .collect(Collectors.toList());
                // Generation: answer grounded in the retrieved contexts.
                String contextText = String.join("\n\n", contexts);
                String prompt = String.format("基于以下内容回答问题:\n%s\n\n问题:%s",
                        contextText, pair.question());
                String answer = chatModel.generate(prompt);
                // Scoring via the RAGAS-style evaluator.
                RagasStyleEvaluator.RagasReport report = generationEvaluator.evaluate(
                        pair.question(), answer, contexts, pair.groundTruth());
                faithfulnessScores.add(report.getFaithfulness());
                answerRelevancyScores.add(report.getAnswerRelevancy());
                contextPrecisionScores.add(report.getContextPrecision());
                processedCount++;
                if (processedCount % 10 == 0) {
                    log.info("评估进度: {}/{}", processedCount, evalDataset.size());
                }
            } catch (Exception e) {
                failedCount++;
                log.warn("评估案例失败: {}", pair.question(), e);
            }
        }
        // FIX: surface how many cases actually contributed to the averages.
        log.info("评估完成: 成功 {} 条, 失败 {} 条", processedCount, failedCount);
        return new RagSystemReport(
                average(faithfulnessScores),
                average(answerRelevancyScores),
                average(contextPrecisionScores),
                evalDataset.size()
        );
    }

    /** Arithmetic mean; 0 for an empty list. */
    private double average(List<Double> values) {
        return values.stream().mapToDouble(Double::doubleValue).average().orElse(0);
    }

    /** System-level averages over the evaluation set. */
    @Data @AllArgsConstructor
    public static class RagSystemReport {
        private double avgFaithfulness;
        private double avgAnswerRelevancy;
        private double avgContextPrecision;
        private int evalSetSize;

        /** Prints a human-readable report to stdout. */
        public void print() {
            System.out.printf("""
                    === RAG系统评估报告 ===
                    测试集大小: %d 条
                    忠实性: %.3f (目标 > 0.85)
                    答案相关性: %.3f (目标 > 0.80)
                    上下文精确率: %.3f (目标 > 0.75)
                    综合得分: %.3f
                    """,
                    evalSetSize,
                    avgFaithfulness,
                    avgAnswerRelevancy,
                    avgContextPrecision,
                    (avgFaithfulness + avgAnswerRelevancy + avgContextPrecision) / 3
            );
        }
    }
}
建立评估体系的关键步骤:先有测试集,再有指标,再持续跑评估对比不同配置的效果。没有评估,所有的"优化"都是在猜。
