第2039篇:Prompt测试工程——让Prompt改动有量化依据
2026/4/30大约 5 分钟
第2039篇:Prompt测试工程——让Prompt改动有量化依据
适读人群:需要系统化评估Prompt改动效果的工程师 | 阅读时长:约17分钟 | 核心价值:建立Prompt的自动化测试体系,让每次改动都有数据支撑
我们有一次Prompt改动,产品说"这个版本更好",工程师说"我觉得差不多",用户反馈说"感觉反而差了"。
三个人的判断完全不同。到底谁对?没有量化数据,这个争论没有答案。
Prompt测试工程就是解决这个问题的:每次Prompt改动,都要有测试数据说话。
测试用例设计
和代码测试一样,好的Prompt测试需要覆盖以下类型:正向用例(POSITIVE)、边界用例(EDGE)、回归用例(REGRESSION)和压力用例(STRESS)。测试用例的数据结构如下:
/**
 * Format of a single prompt test case: the input sent to the model under test
 * plus the expectations its output is checked against.
 */
@Data
@Builder
public class PromptTestCase {
    private String caseId;
    private String description; // what this case is testing
    private TestCaseType type; // POSITIVE / EDGE / REGRESSION / STRESS
    private String input; // user input
    private String systemContext; // extra context (optional); appended to the system prompt when non-null
    // Expected output (several acceptable forms)
    private String exactExpected; // exact match; also the reference text for the similarity and LLM-judge checks
    private List<String> mustContain; // keywords that must be present
    private List<String> mustNotContain; // content that must not be present
    private EvaluationCriteria criteria; // more complex evaluation criteria; selects how the output is judged
    // Priority (lets a quick run execute only the high-priority cases)
    private int priority; // 1 is the highest
}
/**
 * Ways of deciding whether a model output satisfies a test case.
 */
public enum EvaluationCriteria {
    EXACT_MATCH, // exact match
    CONTAINS_KEYWORDS, // contains keywords
    SEMANTIC_SIMILARITY, // semantic similarity
    LLM_AS_JUDGE, // judged by another LLM
    CUSTOM_FUNCTION // custom evaluation function
}

自动化测试执行器
/**
 * Prompt test runner.
 *
 * <p>Runs every test case against the model under test, evaluates each output
 * according to the case's {@link EvaluationCriteria}, and aggregates the outcomes
 * into a {@code TestSuiteReport}.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class PromptTestRunner {

    /** Minimum cosine similarity for a SEMANTIC_SIMILARITY case to pass. */
    private static final double SEMANTIC_SIMILARITY_THRESHOLD = 0.85;

    private final ChatClient testSubject; // model under test
    private final ChatClient judgeModel; // model used for LLM-as-Judge evaluation
    private final EmbeddingModel embeddingModel;

    /**
     * Runs the whole suite and returns the full report.
     *
     * <p>Results keep the order of {@code testCases}: collecting a parallel stream
     * of a {@code List} into a list preserves encounter order.
     *
     * @param systemPrompt the system prompt being tested
     * @param testCases    the cases to run; an empty list yields a report with a pass rate of 0.0
     * @return the aggregated report
     */
    public TestSuiteReport runTestSuite(
            String systemPrompt,
            List<PromptTestCase> testCases) {
        log.info("开始执行Prompt测试套件,共{}个测试用例", testCases.size());

        List<TestCaseResult> results = testCases.parallelStream()
                .map(tc -> runSingleTest(systemPrompt, tc))
                .collect(Collectors.toList());

        List<TestCaseResult> failures = results.stream()
                .filter(r -> !r.isPassed())
                .collect(Collectors.toList());
        int failed = failures.size();
        int passed = results.size() - failed;

        // Pass rate per case type. Cases without a type are left out of this breakdown
        // (groupingBy rejects null keys) but still count towards the overall rate.
        Map<TestCaseType, Double> passRateByType = results.stream()
                .filter(r -> r.getTestCase().getType() != null)
                .collect(Collectors.groupingBy(
                        r -> r.getTestCase().getType(),
                        Collectors.collectingAndThen(
                                Collectors.toList(),
                                group -> {
                                    long groupPassed = group.stream()
                                            .filter(TestCaseResult::isPassed)
                                            .count();
                                    return (double) groupPassed / group.size();
                                })));

        // An empty suite would otherwise produce NaN (0.0 / 0).
        double overallPassRate = results.isEmpty() ? 0.0 : (double) passed / results.size();

        return TestSuiteReport.builder()
                .totalCases(results.size())
                .passedCases(passed)
                .failedCases(failed)
                .overallPassRate(overallPassRate)
                .passRateByType(passRateByType)
                .results(results)
                // NOTE(review): the original called failedCases(...) a second time with this list,
                // which clashes with the int count above. Assumes TestSuiteReport declares a
                // separate List<TestCaseResult> failedResults field; confirm against that class.
                .failedResults(failures)
                .build();
    }

    /**
     * Runs one case and never throws: any exception becomes a failed result so a
     * single broken case cannot abort the suite.
     */
    private TestCaseResult runSingleTest(String systemPrompt, PromptTestCase tc) {
        try {
            String fullSystemPrompt = systemPrompt;
            if (tc.getSystemContext() != null) {
                fullSystemPrompt += "\n\n" + tc.getSystemContext();
            }

            // Call the model under test.
            String actualOutput = testSubject.prompt()
                    .system(fullSystemPrompt)
                    .user(tc.getInput())
                    .call()
                    .content();

            if (actualOutput == null) {
                return TestCaseResult.builder()
                        .testCase(tc)
                        .actualOutput(null)
                        .passed(false)
                        .failReason("模型未返回任何内容")
                        .build();
            }

            // Evaluate the output.
            boolean passed = evaluate(tc, actualOutput);
            String failReason = passed ? null : buildFailReason(tc, actualOutput);
            return TestCaseResult.builder()
                    .testCase(tc)
                    .actualOutput(actualOutput)
                    .passed(passed)
                    .failReason(failReason)
                    .build();
        } catch (Exception e) {
            // Keep the stack trace in the log; the report only carries the message.
            log.warn("Prompt测试用例执行异常,caseId={}", tc.getCaseId(), e);
            return TestCaseResult.builder()
                    .testCase(tc)
                    .actualOutput("ERROR: " + e.getMessage())
                    .passed(false)
                    .failReason("测试执行异常: " + e.getMessage())
                    .build();
        }
    }

    /**
     * Applies the case's evaluation criteria to the model output.
     *
     * @throws IllegalStateException         if the case has no criteria, or lacks the expected text
     *                                       that the chosen criteria needs
     * @throws UnsupportedOperationException for CUSTOM_FUNCTION, which this runner does not implement
     */
    private boolean evaluate(PromptTestCase tc, String actualOutput) {
        EvaluationCriteria criteria = tc.getCriteria();
        if (criteria == null) {
            throw new IllegalStateException("测试用例未指定评估标准(criteria): " + tc.getCaseId());
        }
        return switch (criteria) {
            case EXACT_MATCH -> actualOutput.trim().equals(requireExpected(tc).trim());
            case CONTAINS_KEYWORDS -> {
                String haystack = lower(actualOutput);
                boolean allContained = tc.getMustContain() == null
                        || tc.getMustContain().stream()
                                .allMatch(kw -> haystack.contains(lower(kw)));
                boolean noneExcluded = tc.getMustNotContain() == null
                        || tc.getMustNotContain().stream()
                                .noneMatch(kw -> haystack.contains(lower(kw)));
                yield allContained && noneExcluded;
            }
            case SEMANTIC_SIMILARITY -> {
                float[] expectedEmb = embeddingModel.embed(requireExpected(tc));
                float[] actualEmb = embeddingModel.embed(actualOutput);
                yield cosineSimilarity(expectedEmb, actualEmb) > SEMANTIC_SIMILARITY_THRESHOLD;
            }
            case LLM_AS_JUDGE -> {
                String judgePrompt = String.format("""
                        请判断以下AI输出是否满足要求(输出PASS或FAIL):
                        用户输入:%s
                        期望标准:%s
                        实际输出:%s
                        只输出PASS或FAIL:
                        """, tc.getInput(), requireExpected(tc), actualOutput);
                String judgment = judgeModel.prompt()
                        .user(judgePrompt)
                        .call()
                        .content();
                yield judgment != null
                        && judgment.trim().toUpperCase(java.util.Locale.ROOT).startsWith("PASS");
            }
            // Previously fell through to "pass", so unevaluated cases inflated the pass rate.
            case CUSTOM_FUNCTION -> throw new UnsupportedOperationException(
                    "CUSTOM_FUNCTION评估尚未实现: " + tc.getCaseId());
        };
    }

    /** Returns the case's expected text, failing with a clear message when it is missing. */
    private static String requireExpected(PromptTestCase tc) {
        String expected = tc.getExactExpected();
        if (expected == null) {
            throw new IllegalStateException("测试用例缺少exactExpected: " + tc.getCaseId());
        }
        return expected;
    }

    /** Locale-independent lower-casing, so keyword matching does not vary with the JVM's default locale. */
    private static String lower(String s) {
        return s.toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Cosine similarity of two equal-length vectors, accumulated in double precision.
     *
     * @return the similarity, or 0.0 when either vector has zero magnitude
     * @throws IllegalArgumentException if the vectors differ in length
     */
    private double cosineSimilarity(float[] a, float[] b) {
        if (a.length != b.length) {
            throw new IllegalArgumentException(
                    "embedding维度不一致: " + a.length + " vs " + b.length);
        }
        double dot = 0;
        double normA = 0;
        double normB = 0;
        for (int i = 0; i < a.length; i++) {
            double x = a[i];
            double y = b[i];
            dot += x * y;
            normA += x * x;
            normB += y * y;
        }
        if (normA == 0 || normB == 0) {
            return 0.0;
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }

    /**
     * Builds a human-readable reason for a failed case: missing keywords first,
     * then forbidden content, then a generic mismatch message.
     */
    private String buildFailReason(PromptTestCase tc, String actual) {
        String haystack = lower(actual);
        if (tc.getMustContain() != null) {
            List<String> missing = tc.getMustContain().stream()
                    .filter(kw -> !haystack.contains(lower(kw)))
                    .collect(Collectors.toList());
            if (!missing.isEmpty()) {
                return "缺少必要关键词: " + String.join(", ", missing);
            }
        }
        if (tc.getMustNotContain() != null) {
            List<String> found = tc.getMustNotContain().stream()
                    .filter(kw -> haystack.contains(lower(kw)))
                    .collect(Collectors.toList());
            if (!found.isEmpty()) {
                return "包含了不应该有的内容: " + String.join(", ", found);
            }
        }
        return "输出与期望不匹配";
    }
}

对比测试:量化Prompt改动的效果
/**
 * Prompt version comparison test.
 *
 * <p>Runs the same test cases against a baseline prompt and a candidate prompt and
 * quantifies the difference between the two.
 */
@Service
@RequiredArgsConstructor
public class PromptComparisonTester {

    /** Pass-rate gain above which the candidate counts as a clear improvement. */
    private static final double SIGNIFICANT_GAIN = 0.05;

    /** Pass-rate change below which the candidate counts as a clear degradation. */
    private static final double SIGNIFICANT_LOSS = -0.02;

    private final PromptTestRunner testRunner;

    /**
     * Compares two versions of a prompt and returns a detailed difference analysis.
     *
     * @param baselinePrompt  the prompt currently in use
     * @param candidatePrompt the proposed replacement
     * @param testCases       the cases to run against both prompts
     * @return pass rates for both versions, their delta, the cases that improved or
     *         regressed, and a textual recommendation
     */
    public PromptComparisonReport compare(
            String baselinePrompt,
            String candidatePrompt,
            List<PromptTestCase> testCases) {
        // Run both suites concurrently.
        CompletableFuture<TestSuiteReport> baselineFuture = CompletableFuture
                .supplyAsync(() -> testRunner.runTestSuite(baselinePrompt, testCases));
        CompletableFuture<TestSuiteReport> candidateFuture = CompletableFuture
                .supplyAsync(() -> testRunner.runTestSuite(candidatePrompt, testCases));
        TestSuiteReport baseline = baselineFuture.join();
        TestSuiteReport candidate = candidateFuture.join();

        // Collect the cases that improved and the ones that regressed.
        // Results are matched by index against testCases.
        List<TestCaseComparison> improvements = new ArrayList<>();
        List<TestCaseComparison> regressions = new ArrayList<>();
        for (int i = 0; i < testCases.size(); i++) {
            TestCaseResult baseResult = baseline.getResults().get(i);
            TestCaseResult candResult = candidate.getResults().get(i);
            if (!baseResult.isPassed() && candResult.isPassed()) {
                improvements.add(TestCaseComparison.of(testCases.get(i), baseResult, candResult));
            } else if (baseResult.isPassed() && !candResult.isPassed()) {
                regressions.add(TestCaseComparison.of(testCases.get(i), baseResult, candResult));
            }
        }

        double passRateDelta = candidate.getOverallPassRate() - baseline.getOverallPassRate();
        return PromptComparisonReport.builder()
                .baselinePassRate(baseline.getOverallPassRate())
                .candidatePassRate(candidate.getOverallPassRate())
                .passRateDelta(passRateDelta)
                .improvements(improvements)
                .regressions(regressions)
                .recommendation(buildRecommendation(passRateDelta, regressions.size()))
                .build();
    }

    /**
     * Turns the pass-rate delta and the number of regressed cases into a recommendation.
     *
     * <p>Regressions are always mentioned when present, including when the overall
     * pass rate is roughly unchanged; a flat delta can hide cases that got worse.
     */
    private String buildRecommendation(double delta, int regressionCount) {
        if (delta > SIGNIFICANT_GAIN) {
            if (regressionCount == 0) {
                return "强烈建议采用新版本:总体提升显著,无回退案例";
            }
            return String.format("建议权衡:总体提升%.1f%%,但有%d个回退案例需要关注",
                    delta * 100, regressionCount);
        }
        if (delta < SIGNIFICANT_LOSS) {
            return "不建议采用新版本:总体效果下降";
        }
        if (regressionCount > 0) {
            return String.format("效果相当,但有%d个回退案例需要逐一确认后再决定", regressionCount);
        }
        return "效果相当,可以根据其他因素(可读性、可维护性)决定";
    }
}

CI集成:把Prompt测试放进流水线
# .github/workflows/prompt-tests.yml
name: Prompt Tests
on:
  pull_request:
    paths:
      - 'src/main/resources/prompts/**'
      # Changes to the test cases themselves should also trigger a run.
      - 'src/test/resources/prompt-tests/**'
jobs:
  test-prompts:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run Prompt Tests
        run: |
          mvn test -Dtest=PromptTestSuiteIT -Dspring.profiles.active=ci
        env:
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
      - name: Upload Test Report
        # Upload even when the test step fails; that is when the report is needed most.
        if: always()
        # upload-artifact v3 has been retired by GitHub; v4 is required.
        uses: actions/upload-artifact@v4
        with:
          name: prompt-test-report
          path: target/prompt-test-report.json

// Prompt集成测试(放在CI中运行)
/**
 * Prompt integration test, run in CI under the {@code ci} profile.
 *
 * <p>Runs the customer-service test cases against the production prompt and fails
 * the build when any priority-1 case fails or the overall pass rate drops below the
 * required minimum.
 */
@SpringBootTest
@ActiveProfiles("ci")
class PromptTestSuiteIT {

    /** Minimum acceptable overall pass rate (inclusive). */
    private static final double MIN_OVERALL_PASS_RATE = 0.85;

    @Autowired
    private PromptTestRunner testRunner;

    @Autowired
    private PromptVersionService promptVersionService;

    @Test
    void testCustomerServicePrompt() {
        String prompt = promptVersionService.getProductionPrompt("customer-service");
        List<PromptTestCase> testCases = loadTestCasesFromFile(
                "src/test/resources/prompt-tests/customer-service.json");

        TestSuiteReport report = testRunner.runTestSuite(prompt, testCases);

        // Every high-priority (priority == 1) case must pass.
        long highPriorityFailed = report.getResults().stream()
                .filter(r -> r.getTestCase().getPriority() == 1 && !r.isPassed())
                .count();
        assertThat(highPriorityFailed)
                .as("number of failed priority-1 cases")
                .isZero();

        // Overall pass rate must be at least 85%; exactly 85% is acceptable.
        assertThat(report.getOverallPassRate())
                .as("overall pass rate")
                .isGreaterThanOrEqualTo(MIN_OVERALL_PASS_RATE);
    }
}

Prompt测试工程的价值在于把"感觉"变成"数据"。每次Prompt改动都跑一遍测试,立刻知道改动是进步还是退步,哪些案例改善了,哪些案例退化了。
这是Prompt工程从艺术走向工程的关键一步。
