第2162篇:评估驱动的Prompt迭代——把感性调优变成有数据支撑的工程流程
2026/4/30 · 大约 7 分钟
第2162篇:评估驱动的Prompt迭代——把感性调优变成有数据支撑的工程流程
适读人群:经常做Prompt优化的AI工程师 | 阅读时长:约17分钟 | 核心价值:建立数据驱动的Prompt迭代工程流程,告别"调了一下午感觉还不如之前"的窘境
Prompt工程有一个很奇怪的现象:很多人改了一天Prompt,最后发现改来改去效果没什么变化,甚至不如最初版本。
根本原因是:没有量化评估,改动是盲目的。你不知道哪一步改进了,哪一步退步了,只能靠主观感觉在几个版本里选,而主观感觉在复杂场景下极其不可靠。
评估驱动的Prompt迭代(Evaluation-Driven Prompt Engineering)就是把这个过程变成工程:每次改动都有测试集验证,有数据告诉你改进了多少、在哪个场景改进了、有没有其他场景变差了。
Prompt迭代的典型错误
在讲正确方法前,先列一下常见的错误模式:
错误1:基于单条样本调优
→ 改Prompt让某条样本变好了,但导致其他样本变差
→ 正确做法:在测试集(50条以上)上验证
错误2:不记录版本
→ 改了10版,不记得哪版最好,无法回滚
→ 正确做法:每个版本存档,有唯一标识
错误3:一次改多个地方
→ 同时改了System Prompt和Few-shot示例,不知道是哪个起效果
→ 正确做法:控制变量,一次只改一处
错误4:评估维度单一
→ 只看整体分数,不知道在哪个场景/维度改善了
→ 正确做法:分场景、分维度查看
错误5:忽视回归
→ 新版在目标场景改善了,但在其他场景悄悄变差
→ 正确做法:全测试集跑回归测试
评估驱动迭代的完整流程
实现评估驱动迭代的工具
/**
 * Prompt iteration workbench.
 *
 * <p>Supports prompt version management, quick evaluation runs and
 * side-by-side comparison of runs, so every prompt change is validated
 * against a test set instead of gut feeling.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class PromptIterationWorkbench {

    private final PromptVersionRepository versionRepository;
    private final LlmEvaluationService evaluationService;
    private final TestDatasetService testDatasetService;

    /**
     * Quickly evaluates a prompt version on a small stratified sample.
     *
     * <p>Design goal: a trustworthy signal within ~5 minutes, so at most 50
     * cases are sampled from the dataset (stratified by
     * {@code config.getStratifyBy()}).
     *
     * @param promptContent the prompt text under evaluation
     * @param datasetName   name of the test dataset to load
     * @param config        evaluation configuration (LLM call settings, stratification key)
     * @return aggregated scores for this run
     */
    public IterationEvalResult quickEvaluate(String promptContent,
                                             String datasetName,
                                             EvalConfig config) {
        List<TestCase> dataset = testDatasetService.loadDataset(datasetName);
        // Quick evaluation runs on a small sample (up to 50 cases).
        List<TestCase> evalSample = stratifiedSample(dataset,
                Math.min(50, dataset.size()), config.getStratifyBy());
        String runId = "quick-eval-" + System.currentTimeMillis();
        List<EvaluationReport> reports = evaluateCases(evalSample, promptContent, config, false);
        return buildIterationResult(runId, promptContent, reports, evalSample);
    }

    /**
     * Full evaluation over the complete dataset; used for the final decision
     * because its numbers are more trustworthy than the quick-sample run.
     */
    public IterationEvalResult fullEvaluate(String promptContent,
                                            String datasetName,
                                            EvalConfig config) {
        List<TestCase> dataset = testDatasetService.loadDataset(datasetName);
        String runId = "full-eval-" + System.currentTimeMillis();
        List<EvaluationReport> reports = evaluateCases(dataset, promptContent, config, true);
        return buildIterationResult(runId, promptContent, reports, dataset);
    }

    /** Runs the LLM plus the evaluator over each test case; shared by quick and full evaluation. */
    private List<EvaluationReport> evaluateCases(List<TestCase> cases,
                                                 String promptContent,
                                                 EvalConfig config,
                                                 boolean includeMetadata) {
        List<EvaluationReport> reports = new ArrayList<>(cases.size());
        for (TestCase tc : cases) {
            String output = callLlmWithPrompt(promptContent, tc.getQuestion(), config);
            var requestBuilder = EvaluationRequest.builder()
                    .userInput(tc.getQuestion())
                    .llmOutput(output)
                    .context(tc.getContext());
            if (includeMetadata) {
                // Full runs carry case id and intent label so each report can be traced back.
                requestBuilder.requestId(tc.getId()).intentLabel(tc.getIntentLabel());
            }
            reports.add(evaluationService.evaluate(requestBuilder.build()));
        }
        return reports;
    }

    /**
     * Compares two evaluation runs.
     *
     * <p>Looks beyond the overall score: per-intent deltas expose regressions
     * that an aggregate number would hide.
     */
    public VersionComparisonReport compareVersions(IterationEvalResult baseline,
                                                   IterationEvalResult candidate) {
        // Overall deltas.
        double overallDelta = candidate.getAvgScore() - baseline.getAvgScore();
        double passRateDelta = candidate.getPassRate() - baseline.getPassRate();
        // Per-intent deltas over the union of intents seen in either run.
        Map<String, ScoreDelta> intentDeltas = new HashMap<>();
        Set<String> intents = new HashSet<>();
        intents.addAll(baseline.getIntentScores().keySet());
        intents.addAll(candidate.getIntentScores().keySet());
        for (String intent : intents) {
            double baselineScore = baseline.getIntentScores().getOrDefault(intent, 0.0);
            double candidateScore = candidate.getIntentScores().getOrDefault(intent, 0.0);
            double delta = candidateScore - baselineScore;
            // +/-0.02 dead band filters sampling noise — TODO confirm it fits the dataset size.
            intentDeltas.put(intent, ScoreDelta.builder()
                    .baseline(baselineScore)
                    .candidate(candidateScore)
                    .delta(delta)
                    .improved(delta > 0.02)
                    .degraded(delta < -0.02)
                    .build());
        }
        // Regressed intents, worst (most negative delta) first.
        List<String> regressions = intentDeltas.entrySet().stream()
                .filter(e -> e.getValue().isDegraded())
                .sorted(Comparator.comparingDouble(e -> e.getValue().getDelta()))
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());
        // Improved intents, best first.
        List<String> improvements = intentDeltas.entrySet().stream()
                .filter(e -> e.getValue().isImproved())
                .sorted((a, b) -> Double.compare(b.getValue().getDelta(), a.getValue().getDelta()))
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());
        // Per-dimension changes.
        Map<String, Double> dimensionDeltas = computeDimensionDeltas(baseline, candidate);
        return VersionComparisonReport.builder()
                .baselineRunId(baseline.getRunId())
                .candidateRunId(candidate.getRunId())
                .overallScoreDelta(overallDelta)
                .passRateDelta(passRateDelta)
                .intentDeltas(intentDeltas)
                .regressions(regressions)
                .improvements(improvements)
                .dimensionDeltas(dimensionDeltas)
                .recommendation(buildRecommendation(overallDelta, passRateDelta, regressions, improvements))
                .build();
    }

    /**
     * Analyzes failed samples to surface the dominant failure patterns —
     * the key step for deciding what to change next.
     */
    public FailureAnalysisReport analyzeFailures(IterationEvalResult evalResult) {
        List<EvaluationReport> allReports = evalResult.getReports();
        List<EvaluationReport> failedReports = allReports.stream()
                .filter(r -> !r.isPassed())
                .collect(Collectors.toList());
        int failedCount = failedReports.size();
        // Group failures by their dominant failure type.
        Map<String, List<EvaluationReport>> byFailureType = failedReports.stream()
                .collect(Collectors.groupingBy(this::classifyFailure));
        // Most common failure patterns first. comparingInt(...).reversed() avoids
        // the int-subtraction comparator overflow trap of (b.size() - a.size()).
        List<FailurePattern> patterns = byFailureType.entrySet().stream()
                .sorted(Comparator.comparingInt(
                        (Map.Entry<String, List<EvaluationReport>> e) -> e.getValue().size()).reversed())
                .map(e -> FailurePattern.builder()
                        .failureType(e.getKey())
                        .count(e.getValue().size())
                        .percentage((double) e.getValue().size() / failedCount)
                        .examples(e.getValue().subList(0, Math.min(3, e.getValue().size())))
                        .build())
                .collect(Collectors.toList());
        // Failure rate per intent label.
        Map<String, Double> failureRateByIntent = allReports.stream()
                .collect(Collectors.groupingBy(
                        r -> r.getIntentLabel() != null ? r.getIntentLabel() : "unknown",
                        Collectors.averagingDouble(r -> r.isPassed() ? 0.0 : 1.0)
                ));
        // Intents with the highest failure rate — improve these scenarios first.
        List<String> worstIntents = failureRateByIntent.entrySet().stream()
                .sorted((a, b) -> Double.compare(b.getValue(), a.getValue()))
                .limit(5)
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());
        return FailureAnalysisReport.builder()
                .totalFailed(failedCount)
                .totalEvaluated(allReports.size())
                // Guard against 0/0 -> NaN on an empty run.
                .failureRate(allReports.isEmpty() ? 0.0 : (double) failedCount / allReports.size())
                .failurePatterns(patterns)
                .failureRateByIntent(failureRateByIntent)
                .worstIntents(worstIntents)
                .improvementSuggestions(generateImprovementSuggestions(patterns, worstIntents))
                .build();
    }

    /** Maps a failed report to its dominant failure type via its lowest-scoring dimension. */
    private String classifyFailure(EvaluationReport report) {
        Map<String, DimensionScore> dims = report.getDimensionScores();
        if (dims == null) return "unknown";
        String worstDimension = dims.entrySet().stream()
                .min(Comparator.comparingDouble(e -> e.getValue().getScore()))
                .map(Map.Entry::getKey)
                .orElse("unknown");
        return switch (worstDimension) {
            case "accuracy" -> "事实错误/幻觉";
            case "relevance" -> "跑题/不切题";
            case "completeness" -> "回答不完整";
            case "safety" -> "安全/合规问题";
            case "format" -> "格式不符合规范";
            default -> "综合质量不足";
        };
    }

    /** Turns the top failure patterns into concrete prompt-editing suggestions. */
    private List<String> generateImprovementSuggestions(List<FailurePattern> patterns,
                                                        List<String> worstIntents) {
        List<String> suggestions = new ArrayList<>();
        // Only the three most frequent patterns are worth acting on per iteration.
        patterns.stream().limit(3).forEach(p -> {
            switch (p.getFailureType()) {
                case "事实错误/幻觉" -> suggestions.add(
                        "在System Prompt中强化接地气指令:'只根据提供的资料回答,不要添加资料中没有的信息'");
                case "跑题/不切题" -> suggestions.add(
                        "在System Prompt末尾添加:'回答要紧扣用户问题,不要展开不相关的信息'");
                case "回答不完整" -> suggestions.add(
                        "增加Few-shot示例,示例中的回答要覆盖完整的要点");
                case "格式不符合规范" -> suggestions.add(
                        "在System Prompt中明确格式要求,并提供格式模板");
                // NOTE(review): no suggestion is emitted for the "安全/合规问题"
                // failure type produced by classifyFailure — confirm intentional.
            }
        });
        if (!worstIntents.isEmpty()) {
            suggestions.add("最需要改进的场景:" + String.join("、", worstIntents.subList(0, Math.min(3, worstIntents.size()))) +
                    ",建议针对这些场景增加Few-shot示例");
        }
        return suggestions;
    }

    /**
     * Aggregates per-case reports into a single run result.
     *
     * <p>Reports are assumed to be positionally aligned with {@code testCases}
     * (report i was produced from case i).
     */
    private IterationEvalResult buildIterationResult(String runId, String promptContent,
                                                     List<EvaluationReport> reports,
                                                     List<TestCase> testCases) {
        double avgScore = reports.stream().mapToDouble(EvaluationReport::getOverallScore).average().orElse(0);
        double passRate = reports.stream().mapToDouble(r -> r.isPassed() ? 1.0 : 0.0).average().orElse(0);
        // Per-intent mean score, accumulated as {sum, count}. The previous
        // merge((a, b) -> (a + b) / 2) was a running pairwise mean that
        // over-weighted later samples instead of computing a true average.
        Map<String, double[]> intentAcc = new HashMap<>();
        for (int i = 0; i < reports.size() && i < testCases.size(); i++) {
            String intent = testCases.get(i).getIntentLabel() != null
                    ? testCases.get(i).getIntentLabel() : "unknown";
            double[] acc = intentAcc.computeIfAbsent(intent, k -> new double[2]);
            acc[0] += reports.get(i).getOverallScore();
            acc[1]++;
        }
        Map<String, Double> intentScores = new HashMap<>();
        intentAcc.forEach((intent, acc) -> intentScores.put(intent, acc[0] / acc[1]));
        return IterationEvalResult.builder()
                .runId(runId)
                .promptContent(promptContent)
                .sampleCount(reports.size())
                .avgScore(avgScore)
                .passRate(passRate)
                .intentScores(intentScores)
                .reports(reports)
                .evaluatedAt(Instant.now())
                .build();
    }

    /** Per-dimension score delta (candidate - baseline) over dimensions present in the baseline. */
    private Map<String, Double> computeDimensionDeltas(IterationEvalResult baseline,
                                                       IterationEvalResult candidate) {
        Map<String, Double> baselineAvgs = computeDimensionAverages(baseline.getReports());
        Map<String, Double> candidateAvgs = computeDimensionAverages(candidate.getReports());
        Map<String, Double> deltas = new HashMap<>();
        baselineAvgs.forEach((dim, baseScore) -> {
            // A dimension missing from the candidate run counts as unchanged (delta 0).
            double candidateScore = candidateAvgs.getOrDefault(dim, baseScore);
            deltas.put(dim, candidateScore - baseScore);
        });
        return deltas;
    }

    /** Mean score per evaluation dimension across all reports. */
    private Map<String, Double> computeDimensionAverages(List<EvaluationReport> reports) {
        Map<String, List<Double>> byDimension = new HashMap<>();
        reports.forEach(r -> {
            if (r.getDimensionScores() != null) {
                r.getDimensionScores().forEach((dim, score) ->
                        byDimension.computeIfAbsent(dim, k -> new ArrayList<>()).add(score.getScore()));
            }
        });
        Map<String, Double> avgs = new HashMap<>();
        byDimension.forEach((dim, scores) ->
                avgs.put(dim, scores.stream().mapToDouble(Double::doubleValue).average().orElse(0)));
        return avgs;
    }

    /** Human-readable ship/no-ship recommendation; thresholds assume scores on a 0-1 scale. */
    private String buildRecommendation(double scoreDelta, double passRateDelta,
                                       List<String> regressions, List<String> improvements) {
        if (scoreDelta > 0.05 && regressions.isEmpty()) {
            return "强烈推荐:综合分提升" + String.format("+%.1f%%", scoreDelta * 100) + ",无回归风险";
        } else if (scoreDelta > 0.02 && regressions.size() <= 2) {
            return "谨慎推荐:综合分有提升,但以下场景有轻微退步:" + String.join("、", regressions);
        } else if (scoreDelta < -0.03) {
            return "不推荐:综合分下降" + String.format("%.1f%%", Math.abs(scoreDelta * 100));
        } else {
            return "中性:改善和退步相抵,建议继续优化后再决策";
        }
    }

    /**
     * Samples {@code size} cases from the dataset, stratified by intent when requested.
     *
     * <p>Uses one fixed-seed Random so quick evaluations are reproducible across
     * runs (previously the stratified branch shuffled without a seed while the
     * fallback used seed 42). Each intent group contributes at least one case
     * even when {@code size / groupCount} rounds down to zero; the result is
     * then trimmed back to {@code size}.
     */
    private List<TestCase> stratifiedSample(List<TestCase> dataset, int size, String stratifyBy) {
        Random rng = new Random(42);
        if ("intent".equals(stratifyBy)) {
            Map<String, List<TestCase>> byIntent = dataset.stream()
                    .collect(Collectors.groupingBy(tc -> tc.getIntentLabel() != null ? tc.getIntentLabel() : "unknown"));
            int perGroup = Math.max(1, size / Math.max(1, byIntent.size()));
            List<TestCase> result = new ArrayList<>();
            for (List<TestCase> group : byIntent.values()) {
                List<TestCase> shuffled = new ArrayList<>(group);
                Collections.shuffle(shuffled, rng);
                result.addAll(shuffled.subList(0, Math.min(perGroup, shuffled.size())));
            }
            return result.size() > size ? result.subList(0, size) : result;
        }
        List<TestCase> shuffled = new ArrayList<>(dataset);
        Collections.shuffle(shuffled, rng);
        return shuffled.subList(0, Math.min(size, shuffled.size()));
    }

    /** Calls the LLM with the given prompt; the real implementation is elsewhere. */
    private String callLlmWithPrompt(String promptContent, String question, EvalConfig config) {
        return ""; // placeholder
    }
}
实战案例:一次典型的Prompt迭代
问题:RAG客服系统的准确性分数只有0.68,目标是0.80+。
第一步:跑失败分析,发现40%的失败是"跑题"——模型回答了与问题相关但不直接的信息。
假设:System Prompt里没有明确要求"直接回答用户的问题"。
修改:在System Prompt末尾加一行:"请直接回答用户的问题,不要先介绍背景,不要展开与问题无直接关系的内容。"
评估结果:快速评估(50条样本),准确性从0.68提升到0.74,其他维度没有退步。
第二步:继续失败分析,发现剩余失败主要是"幻觉"——引用了文档里没有的信息。
修改:强化接地气指令,增加了明确的示例。
结果:完整评估(200条),准确性0.74→0.82,超过目标。
整个过程3天,5个Prompt版本,每个版本都有数据支撑。这就是评估驱动迭代的价值。
