第2167篇:自动化回归测试套件——确保每次Prompt更新不破坏已有功能
2026/4/30 · 大约 5 分钟
第2167篇:自动化回归测试套件——确保每次Prompt更新不破坏已有功能
适读人群:维护LLM系统的工程师 | 阅读时长:约17分钟 | 核心价值:建立LLM系统的自动化回归测试体系,让每次变更都有安全网,告别"上线后才发现搞坏了别的功能"
"新Prompt上线了,但A功能修好了,B功能又出问题了。"
这是软件开发里的"打地鼠"现象,在LLM系统里尤其常见。
原因很简单:LLM Prompt是全局的,一个System Prompt会影响所有场景的输出。你针对某类问题调优了Prompt,可能不知不觉地改变了对其他类问题的处理。
传统软件用单元测试和集成测试做回归保护。LLM系统同样需要,但测试对象不是"功能是否正确执行",而是"输出质量是否达标"。
LLM回归测试的核心设计
LLM回归测试的挑战:
1. 非确定性:同样输入,不同运行结果可能不同
2. 开放式输出:没有固定的"正确答案"用于精确匹配
3. 评估成本:每次跑测试需要调用LLM+评估器
解决策略:
1. 用模糊匹配 + 语义相似度,而不是字符串精确匹配
2. 用硬性规则检查(关键词/格式/安全)保证最低质量
3. 用质量分数阈值做通过/失败判断
4. 采样+缓存,控制每次回归的成本

回归测试框架实现
/**
 * Automated regression test runner for LLM prompts.
 *
 * <p>Intended to be wired into a CI/CD pipeline so the suite runs on every prompt change.
 * Each test case is sent to the LLM, checked against its assertions, scored by the
 * evaluation service, and the aggregated report is persisted.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmRegressionTestRunner {

    /** Similarity threshold applied when a SEMANTIC_SIMILARITY assertion does not define one. */
    private static final double DEFAULT_SIMILARITY_THRESHOLD = 0.7;

    /** Share of characters that must fall in the CJK range for output to count as Chinese. */
    private static final double CHINESE_CHAR_RATIO = 0.3;

    /** Maximum number of failed cases copied into the failure summary. */
    private static final int TOP_FAILED_LIMIT = 5;

    /**
     * Shared JSON parser: building an ObjectMapper per call is expensive.
     * Trailing-token checking makes input such as {@code {} garbage} fail instead of
     * being accepted because its first value parses.
     */
    private static final com.fasterxml.jackson.databind.ObjectMapper JSON_MAPPER =
            new com.fasterxml.jackson.databind.ObjectMapper()
                    .enable(com.fasterxml.jackson.databind.DeserializationFeature.FAIL_ON_TRAILING_TOKENS);

    private final RegressionTestSuiteRepository suiteRepository;
    private final LlmEvaluationService evaluationService;
    private final EmbeddingModel embeddingModel;
    private final TestResultRepository resultRepository;

    /**
     * Runs a complete regression suite against one prompt version and saves the report.
     *
     * <p>When the suite is configured as fail-fast, execution stops at the first critical
     * failure; the pass rate is then computed over the executed cases only.
     *
     * @param promptVersion prompt version under test
     * @param suiteId       identifier of the test suite
     * @return the persisted test report
     * @throws NotFoundException if no suite exists for {@code suiteId}
     */
    public RegressionTestReport runSuite(String promptVersion, String suiteId) {
        RegressionTestSuite suite = suiteRepository.findById(suiteId)
                .orElseThrow(() -> new NotFoundException("测试套件不存在: " + suiteId));
        log.info("开始回归测试: 套件={}, Prompt版本={}, 测试用例数={}",
                suiteId, promptVersion, suite.getTestCases().size());

        long startTime = System.currentTimeMillis();
        List<TestCaseResult> results = new ArrayList<>();
        int passedCount = 0;
        int failedCount = 0;
        for (RegressionTestCase tc : suite.getTestCases()) {
            TestCaseResult result = runSingleTestCase(tc, promptVersion);
            results.add(result);
            if (result.isPassed()) {
                passedCount++;
            } else {
                failedCount++;
            }
            // Fail fast: stop at the first critical failure when the suite asks for it.
            if (result.isCriticalFailure() && suite.isFailFast()) {
                log.error("回归测试遇到Critical失败,停止测试: {}", tc.getId());
                break;
            }
        }
        long duration = System.currentTimeMillis() - startTime;

        // Without the guard an empty suite yields 0/0 = NaN, which then leaks into the
        // stored report. Zero keeps overallPassed false, as the NaN comparison did.
        double passRate = results.isEmpty() ? 0.0 : (double) passedCount / results.size();

        RegressionTestReport report = RegressionTestReport.builder()
                .suiteId(suiteId)
                .promptVersion(promptVersion)
                .timestamp(Instant.now())
                .totalCount(results.size())
                .passedCount(passedCount)
                .failedCount(failedCount)
                .passRate(passRate)
                .durationMs(duration)
                .testResults(results)
                .overallPassed(passRate >= suite.getRequiredPassRate())
                .failureSummary(summarizeFailures(results))
                .build();
        resultRepository.save(report);

        // SLF4J only understands "{}"; numeric precision has to be applied beforehand.
        log.info("回归测试完成: 通过率={}%(需要{}%), 耗时={}ms",
                formatPercent(passRate), formatPercent(suite.getRequiredPassRate()), duration);
        return report;
    }

    /**
     * Runs one test case: calls the LLM, executes every assertion, scores the output and
     * decides pass/fail.
     *
     * <p>A case passes when all critical assertions pass and the overall quality score
     * reaches the case's minimum. Any exception is logged and converted into an error result.
     */
    private TestCaseResult runSingleTestCase(RegressionTestCase tc, String promptVersion) {
        long startTime = System.currentTimeMillis();
        try {
            // 1. Call the LLM; latency covers only this call.
            String output = callLlmWithVersion(promptVersion, tc.getInput(), tc.getContext());
            long latencyMs = System.currentTimeMillis() - startTime;

            // 2. Execute every assertion.
            List<AssertionResult> assertionResults = new ArrayList<>();
            for (TestAssertion assertion : tc.getAssertions()) {
                assertionResults.add(executeAssertion(assertion, tc.getInput(), output));
            }

            // 3. Compute the overall evaluation score.
            EvaluationReport evalReport = evaluationService.evaluate(
                    EvaluationRequest.builder()
                            .userInput(tc.getInput())
                            .llmOutput(output)
                            .context(tc.getContext())
                            .build());

            // 4. Decide pass/fail; only assertions flagged critical act as hard checks.
            boolean allHardChecksPassed = assertionResults.stream()
                    .filter(ar -> ar.getAssertion().isCritical())
                    .allMatch(AssertionResult::isPassed);
            boolean qualityThresholdMet = evalReport.getOverallScore() >= tc.getMinQualityScore();
            boolean passed = allHardChecksPassed && qualityThresholdMet;
            // A case is a critical failure only when a critical assertion failed.
            boolean critical = !allHardChecksPassed;

            return TestCaseResult.builder()
                    .testCaseId(tc.getId())
                    .testCaseName(tc.getName())
                    .input(tc.getInput())
                    .output(output)
                    .passed(passed)
                    .criticalFailure(critical)
                    .latencyMs(latencyMs)
                    .qualityScore(evalReport.getOverallScore())
                    .assertionResults(assertionResults)
                    .failureReason(passed
                            ? null
                            : buildFailureReason(assertionResults, evalReport, tc.getMinQualityScore()))
                    .build();
        } catch (Exception e) {
            // Boundary catch: one broken case must not abort the whole suite.
            log.error("测试用例执行失败: {}", tc.getId(), e);
            return TestCaseResult.error(tc, e.getMessage());
        }
    }

    /**
     * Evaluates a single assertion against the LLM output.
     *
     * @param assertion assertion to evaluate
     * @param input     original user input (not used by the current assertion types)
     * @param output    LLM output being checked
     * @return the assertion outcome, with a failure message when it did not pass
     */
    private AssertionResult executeAssertion(TestAssertion assertion, String input, String output) {
        return switch (assertion.getType()) {
            case CONTAINS -> {
                boolean passed = output.contains(assertion.getExpectedValue());
                yield AssertionResult.of(assertion, passed,
                        passed ? null : "输出未包含期望内容: " + assertion.getExpectedValue());
            }
            case NOT_CONTAINS -> {
                boolean passed = !output.contains(assertion.getExpectedValue());
                yield AssertionResult.of(assertion, passed,
                        passed ? null : "输出包含了禁止内容: " + assertion.getExpectedValue());
            }
            case MATCHES_PATTERN -> {
                // String.matches requires the whole output to match the expression.
                boolean passed = output.matches(assertion.getExpectedValue());
                yield AssertionResult.of(assertion, passed,
                        passed ? null : "输出不匹配正则: " + assertion.getExpectedValue());
            }
            case SEMANTIC_SIMILARITY -> {
                float[] outputEmb = embeddingModel.embed(output);
                float[] expectedEmb = embeddingModel.embed(assertion.getExpectedValue());
                double similarity = cosineSimilarity(outputEmb, expectedEmb);
                double threshold = assertion.getThreshold() != null
                        ? assertion.getThreshold()
                        : DEFAULT_SIMILARITY_THRESHOLD;
                boolean passed = similarity >= threshold;
                yield AssertionResult.of(assertion, passed, passed ? null :
                        String.format("语义相似度=%.3f,低于阈值%.3f", similarity, threshold));
            }
            case MAX_LENGTH -> {
                int maxLen = Integer.parseInt(assertion.getExpectedValue());
                boolean passed = output.length() <= maxLen;
                yield AssertionResult.of(assertion, passed,
                        passed ? null : "输出长度" + output.length() + "超过限制" + maxLen);
            }
            case JSON_VALID -> {
                boolean passed = isValidJson(output);
                yield AssertionResult.of(assertion, passed,
                        passed ? null : "输出不是有效的JSON格式");
            }
            case LANGUAGE -> {
                // Output counts as Chinese when more than 30% of its chars are in U+4E00..U+9FA5.
                long cjkChars = output.chars().filter(c -> c >= 0x4E00 && c <= 0x9FA5).count();
                boolean isChinese = cjkChars > output.length() * CHINESE_CHAR_RATIO;
                // Any expected value other than "zh" is treated as "not Chinese".
                boolean expectedChinese = "zh".equals(assertion.getExpectedValue());
                boolean passed = isChinese == expectedChinese;
                yield AssertionResult.of(assertion, passed,
                        passed ? null : "输出语言不符合预期");
            }
        };
    }

    /**
     * Builds a human-readable failure reason: every failed assertion, followed by the
     * quality-score shortfall when the score is below {@code minScore}. Parts are joined with "; ".
     */
    private String buildFailureReason(List<AssertionResult> assertions,
                                      EvaluationReport eval, double minScore) {
        List<String> reasons = assertions.stream()
                .filter(ar -> !ar.isPassed())
                .map(ar -> "断言失败[" + ar.getAssertion().getType() + "]: " + ar.getFailureReason())
                .collect(Collectors.toCollection(ArrayList::new));
        if (eval.getOverallScore() < minScore) {
            reasons.add(String.format("质量分%.3f低于阈值%.3f", eval.getOverallScore(), minScore));
        }
        return String.join("; ", reasons);
    }

    /**
     * Summarizes failed cases: total failures, critical failures, failed assertions counted
     * per assertion type, and up to {@value #TOP_FAILED_LIMIT} failed cases.
     */
    private RegressionFailureSummary summarizeFailures(List<TestCaseResult> results) {
        List<TestCaseResult> failed = results.stream()
                .filter(r -> !r.isPassed())
                .collect(Collectors.toList());

        // Count failed assertions by type.
        // NOTE(review): results built by TestCaseResult.error(...) presumably carry no
        // assertion results; the null guard keeps one errored case from breaking the report.
        Map<String, Long> failuresByType = failed.stream()
                .filter(r -> r.getAssertionResults() != null)
                .flatMap(r -> r.getAssertionResults().stream())
                .filter(ar -> !ar.isPassed())
                .map(ar -> ar.getAssertion().getType().name())
                .collect(Collectors.groupingBy(t -> t, Collectors.counting()));

        // Copy instead of exposing a subList view backed by the local list.
        List<TestCaseResult> topFailed =
                new ArrayList<>(failed.subList(0, Math.min(TOP_FAILED_LIMIT, failed.size())));

        return RegressionFailureSummary.builder()
                .totalFailed(failed.size())
                .criticalFailures((int) results.stream().filter(TestCaseResult::isCriticalFailure).count())
                .failuresByType(failuresByType)
                .topFailedCases(topFailed)
                .build();
    }

    /**
     * Cosine similarity of two embedding vectors.
     *
     * <p>A small epsilon in the denominator avoids division by zero, so a zero vector
     * yields 0 rather than NaN.
     *
     * @throws IllegalArgumentException if the vectors have different dimensions
     */
    private double cosineSimilarity(float[] a, float[] b) {
        if (a.length != b.length) {
            throw new IllegalArgumentException(
                    "Embedding dimensions differ: " + a.length + " vs " + b.length);
        }
        double dot = 0;
        double normA = 0;
        double normB = 0;
        for (int i = 0; i < a.length; i++) {
            // Widen before multiplying so products are not rounded to float precision.
            dot += (double) a[i] * b[i];
            normA += (double) a[i] * a[i];
            normB += (double) b[i] * b[i];
        }
        return dot / (Math.sqrt(normA) * Math.sqrt(normB) + 1e-8);
    }

    /**
     * Returns whether {@code s} is exactly one well-formed JSON document.
     *
     * <p>Null, blank and content-free input is rejected explicitly because the parser
     * reports "no content" without throwing.
     */
    private boolean isValidJson(String s) {
        if (s == null || s.isBlank()) {
            return false;
        }
        try {
            com.fasterxml.jackson.databind.JsonNode root = JSON_MAPPER.readTree(s);
            return root != null && !root.isMissingNode();
        } catch (java.io.IOException e) {
            // Jackson's parse exceptions extend IOException; any of them means invalid JSON.
            return false;
        }
    }

    /** Formats a 0..1 ratio as a percentage with one decimal place, without the percent sign. */
    private static String formatPercent(double ratio) {
        return String.format("%.1f", ratio * 100);
    }

    /**
     * Calls the LLM using the given prompt version.
     *
     * <p>Placeholder: currently returns an empty string for every input.
     * TODO: replace with the real LLM invocation before relying on test results.
     */
    private String callLlmWithVersion(String promptVersion, String input, String context) {
        return ""; // placeholder
    }
}
CI/CD集成
/**
 * CI/CD entry point for the LLM regression suite.
 *
 * <p>Runs as part of the Maven/Gradle test phase; a failing test blocks deployment.
 */
@SpringBootTest
@Slf4j
class LlmRegressionTest {

    /** Identifier of the suite executed by this test. */
    private static final String SUITE_ID = "main-regression-suite";

    /** Minimum overall pass rate required for the build to pass. */
    private static final double MIN_PASS_RATE = 0.90;

    @Autowired
    private LlmRegressionTestRunner testRunner;

    /** Prompt version under test, read from the {@code regression.prompt-version} property. */
    @Value("${regression.prompt-version}")
    private String promptVersion;

    /**
     * Fails when any test case is a critical failure or when the overall pass rate is
     * below {@link #MIN_PASS_RATE}.
     */
    @Test
    void regressionSuite_ShouldPassAllCriticalChecks() {
        RegressionTestReport report = testRunner.runSuite(promptVersion, SUITE_ID);

        // Any critical failure fails the test immediately (blocks deployment).
        assertThat(report.getTestResults())
                .filteredOn(TestCaseResult::isCriticalFailure)
                .as("不应该有Critical级别的回归失败")
                .isEmpty();

        // The overall pass rate must reach the required level.
        assertThat(report.getPassRate())
                .as("回归测试通过率应该 >= 90%")
                .isGreaterThanOrEqualTo(MIN_PASS_RATE);

        // SLF4J only supports "{}" placeholders, so the percentage is formatted first.
        log.info("回归测试通过: 通过率={}%", String.format("%.1f", report.getPassRate() * 100));
    }
}
测试套件的维护策略
测试套件应该持续更新,有三种情况需要添加新测试用例:
- 发现Bug时:每次修复了一个Prompt问题,把那个出问题的用例加进测试套件,防止同类问题复发
- 新功能上线时:新场景上线,同步添加覆盖该场景的回归测试
- 定期审查时:每月review一次测试套件,删除过时的用例,更新失效的断言
测试套件本身也是一种"文档"——它记录了系统的所有预期行为,新团队成员看测试套件能快速了解系统的规格。
