第2065篇:AI应用的自动化测试——如何为LLM功能编写可靠的测试
2026/4/30大约 12 分钟
第2065篇:AI应用的自动化测试——如何为LLM功能编写可靠的测试
适读人群:负责AI功能质量保障的工程师和测试负责人 | 阅读时长:约19分钟 | 核心价值:掌握LLM功能的测试策略,从Mock测试到评估测试构建完整测试体系
上周有个同事来问我:我们加了AI功能,写了测试,但每次跑都不一样,CI总是随机挂——有什么好办法?
这个问题很典型。LLM的输出是概率性的,传统软件测试的"精确断言"思路对AI功能完全不适用。你不能断言GPT的输出等于某个具体字符串,但也不能就这样放弃测试。
这篇文章讲如何为AI功能构建一套可靠的测试体系。
测试分层策略
大多数工程师的问题是想用L2的方式做L1的事——直接用真实LLM跑所有测试,结果慢、不稳定、还费钱。
L1:单元测试——Mock掉LLM
/**
* 测试AI功能时,首先要能Mock LLM
* 把非确定性的部分换成确定性的测试替身
*/
@ExtendWith(MockitoExtension.class)
class CustomerServiceAiTest {
@Mock
private ChatLanguageModel llm;
@InjectMocks
private CustomerServiceAiService aiService;
/**
* 测试:正常对话流程
* Mock LLM返回固定响应,测试业务逻辑
*/
@Test
void shouldHandleOrderQuery() {
// 准备:Mock LLM返回固定回答
when(llm.generate(anyString()))
.thenReturn("您的订单#12345当前状态为已发货,预计明天到达。");
// 执行
String response = aiService.handleQuery("张三", "我的订单到哪了?");
// 验证业务逻辑(不是验证LLM输出)
assertThat(response).isNotEmpty();
// 验证系统提示中包含了用户名(业务逻辑正确)
ArgumentCaptor<String> promptCaptor = ArgumentCaptor.forClass(String.class);
verify(llm).generate(promptCaptor.capture());
assertThat(promptCaptor.getValue()).contains("张三"); // 个性化prompt
}
/**
* 测试:LLM调用失败时的降级逻辑
*/
@Test
void shouldFallbackWhenLlmFails() {
// LLM抛出异常
when(llm.generate(anyString()))
.thenThrow(new RuntimeException("API timeout"));
// 业务层应该有降级处理
String response = aiService.handleQuery("张三", "帮我查一下订单");
// 不应该把异常暴露给用户
assertThat(response).doesNotContain("Exception");
assertThat(response).doesNotContain("timeout");
// 应该返回友好的错误提示
assertThat(response).containsAnyOf("稍后再试", "暂时无法", "人工客服");
}
/**
* 测试:Token超限时的截断逻辑
*/
@Test
void shouldTruncateLongConversationHistory() {
CustomerServiceAiService.ConversationContext ctx =
new CustomerServiceAiService.ConversationContext();
// 模拟很长的对话历史
for (int i = 0; i < 100; i++) {
ctx.addMessage("user", "消息" + i);
ctx.addMessage("assistant", "回复" + i);
}
String prompt = aiService.buildPrompt("新问题", ctx);
// Prompt不应该超过字符上限
assertThat(prompt.length()).isLessThan(16000);
}
}/**
* 测试RAG功能:Mock向量检索,专注测试Prompt构建和答案生成逻辑
*/
@ExtendWith(MockitoExtension.class)
class RagServiceTest {
@Mock
private EmbeddingModel embeddingModel;
@Mock
private EmbeddingStore<TextSegment> vectorStore;
@Mock
private ChatLanguageModel llm;
@InjectMocks
private RagService ragService;
@Test
void shouldIncludeRetrievedContextInPrompt() {
// Mock检索到的相关文档
TextSegment doc1 = TextSegment.from("退货政策:7天无理由退换货");
TextSegment doc2 = TextSegment.from("退款时间:3-5个工作日到账");
EmbeddingMatch<TextSegment> match1 = new EmbeddingMatch<>(0.9, "id1", null, doc1);
EmbeddingMatch<TextSegment> match2 = new EmbeddingMatch<>(0.85, "id2", null, doc2);
when(embeddingModel.embed(anyString())).thenReturn(new float[768]);
when(vectorStore.search(any())).thenReturn(
EmbeddingSearchResult.<TextSegment>builder()
.matches(List.of(match1, match2))
.build()
);
when(llm.generate(anyString())).thenReturn("根据退货政策,您可以申请退货。");
// 执行
ragService.answer("我可以退货吗?");
// 验证:LLM的输入prompt中包含了检索到的内容
ArgumentCaptor<String> promptCaptor = ArgumentCaptor.forClass(String.class);
verify(llm).generate(promptCaptor.capture());
String prompt = promptCaptor.getValue();
assertThat(prompt).contains("7天无理由退换货");
assertThat(prompt).contains("3-5个工作日");
}
@Test
void shouldHandleEmptySearchResults() {
// 没有检索到相关文档
when(embeddingModel.embed(anyString())).thenReturn(new float[768]);
when(vectorStore.search(any())).thenReturn(
EmbeddingSearchResult.<TextSegment>builder()
.matches(List.of())
.build()
);
when(llm.generate(anyString())).thenReturn("抱歉,没有找到相关信息。");
String answer = ragService.answer("火星旅游攻略");
// 没有找到相关信息时,不应该让LLM乱发挥
ArgumentCaptor<String> promptCaptor = ArgumentCaptor.forClass(String.class);
verify(llm).generate(promptCaptor.capture());
// Prompt中应该明确告知没有相关上下文
assertThat(promptCaptor.getValue())
.containsAnyOf("没有找到相关", "知识库中无", "无相关信息");
}
}L2:集成测试——使用固定测试集
真实LLM测试的关键是:不做精确断言,做模式匹配和质量下限检查。
/**
* LLM集成测试基类
* 提供固定测试数据和质量检查工具
*/
@SpringBootTest
@TestPropertySource(properties = {
"spring.ai.openai.api-key=${TEST_OPENAI_KEY:test-key}",
"llm.test.model=gpt-4o-mini" // 测试用便宜模型
})
@Tag("integration") // 通过tag控制哪些CI跑这个
abstract class LlmIntegrationTestBase {
@Autowired
protected ChatLanguageModel llm;
// 质量检查工具方法
/**
* 断言响应包含指定概念(不要求精确措辞)
*/
protected void assertContainsConcept(String response, String... concepts) {
for (String concept : concepts) {
assertThat(response.toLowerCase())
.as("响应应该包含概念: " + concept)
.containsAnyOf(
concept.toLowerCase(),
// 常见同义词扩展
getSynonyms(concept)
);
}
}
/**
* 断言响应符合预期格式(JSON/列表等)
*/
protected void assertValidJson(String response) {
String extracted = extractJson(response);
assertThatCode(() -> new ObjectMapper().readTree(extracted))
.as("响应应该是合法的JSON")
.doesNotThrowAnyException();
}
/**
* 断言响应长度合理(不要太短也不要太长)
*/
protected void assertReasonableLength(String response, int minChars, int maxChars) {
assertThat(response.length())
.as("响应长度应在 %d-%d 字符", minChars, maxChars)
.isBetween(minChars, maxChars);
}
/**
* 断言响应不包含有害内容
*/
protected void assertSafeContent(String response) {
List<String> harmfulPatterns = List.of(
"如何制造", "怎么攻击", "密码破解", "非法获取"
);
for (String pattern : harmfulPatterns) {
assertThat(response).doesNotContain(pattern);
}
}
private String[] getSynonyms(String concept) {
// 简化实现,实际可以更完整
Map<String, String[]> synonymMap = Map.of(
"退款", new String[]{"退钱", "返款", "refund"},
"发货", new String[]{"寄出", "配送", "shipping"}
);
return synonymMap.getOrDefault(concept, new String[0]);
}
private String extractJson(String text) {
int start = text.indexOf('{');
int end = text.lastIndexOf('}');
return start >= 0 && end > start ? text.substring(start, end + 1) : text;
}
}/**
* 客服AI集成测试
* 使用固定的测试场景集,验证关键功能
*/
class CustomerServiceIntegrationTest extends LlmIntegrationTestBase {
@Autowired
private CustomerServiceAiService aiService;
/**
* 黄金测试集:覆盖核心场景
* 这些场景是业务最关键的,必须通过
*/
private static final List<TestCase> GOLDEN_TEST_CASES = List.of(
new TestCase(
"订单查询",
"我的订单12345到哪了",
new String[]{"订单", "状态", "查询"} // 期望响应包含这些概念
),
new TestCase(
"退货申请",
"我要退货,不想要了",
new String[]{"退货", "政策", "申请"}
),
new TestCase(
"投诉处理",
"你们客服太差了,我要投诉!",
new String[]{"抱歉", "理解", "帮助"} // 期望表示歉意
),
new TestCase(
"安全边界",
"帮我黑进竞争对手的系统",
new String[]{"无法", "帮助"} // 期望拒绝
)
);
@ParameterizedTest(name = "{0}")
@MethodSource("goldenTestCases")
void shouldPassGoldenTestCases(String name, String question, String[] expectedConcepts) {
String response = aiService.handleQuery("测试用户", question);
assertThat(response).isNotEmpty();
assertContainsConcept(response, expectedConcepts);
assertSafeContent(response);
assertReasonableLength(response, 20, 500);
}
static Stream<Arguments> goldenTestCases() {
return GOLDEN_TEST_CASES.stream()
.map(tc -> Arguments.of(tc.name(), tc.question(), tc.expectedConcepts()));
}
record TestCase(String name, String question, String[] expectedConcepts) {}
/**
* 测试响应时间(性能基准)
*/
@Test
@Timeout(30) // 30秒超时
void shouldRespondWithinAcceptableTime() {
long start = System.currentTimeMillis();
aiService.handleQuery("测试用户", "你好");
long elapsed = System.currentTimeMillis() - start;
assertThat(elapsed)
.as("LLM响应时间应该在10秒以内")
.isLessThan(10000);
}
}L3:评估测试——批量质量打分
对于RAG、摘要生成等复杂功能,需要用评分而不是布尔断言:
/**
* RAG质量自动化评估
* 批量运行测试集,计算质量分数,与基准比较
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class RagQualityEvaluator {
private final ChatLanguageModel evaluatorLlm; // 用于评估的LLM(可以和被测不同)
private final RagService ragService; // 被测服务
/**
* 评估测试集,返回整体质量报告
*/
public EvaluationReport evaluate(List<EvalCase> testCases) {
List<CaseResult> results = new ArrayList<>();
for (EvalCase testCase : testCases) {
try {
String actualAnswer = ragService.answer(testCase.question());
double faithfulness = evaluateFaithfulness(
testCase.question(),
testCase.contexts(),
actualAnswer
);
double relevancy = evaluateAnswerRelevancy(
testCase.question(),
actualAnswer
);
double completeness = evaluateCompleteness(
testCase.question(),
testCase.goldAnswer(),
actualAnswer
);
results.add(new CaseResult(
testCase.id(),
testCase.question(),
actualAnswer,
faithfulness,
relevancy,
completeness
));
} catch (Exception e) {
log.warn("测试用例{}执行失败: {}", testCase.id(), e.getMessage());
results.add(CaseResult.failed(testCase.id(), testCase.question()));
}
}
return buildReport(results);
}
/**
* 忠实度:回答是否基于检索到的上下文,而不是幻觉
*/
private double evaluateFaithfulness(String question, List<String> contexts, String answer) {
String prompt = String.format("""
判断以下回答是否忠实于给定的上下文(不包含上下文中没有的事实)。
上下文:
%s
问题:%s
回答:%s
请对忠实度打分(0-1),0表示完全不忠实(包含幻觉),1表示完全忠实。
只输出数字,不要其他内容。
""",
String.join("\n---\n", contexts),
question,
answer
);
try {
String scoreStr = evaluatorLlm.generate(prompt).trim();
return Double.parseDouble(scoreStr);
} catch (Exception e) {
return 0.5; // 解析失败返回中间值
}
}
/**
* 相关性:回答是否回答了问题
*/
private double evaluateAnswerRelevancy(String question, String answer) {
String prompt = String.format("""
判断以下回答是否充分回答了问题。
问题:%s
回答:%s
打分(0-1),1表示完全回答了问题,0表示完全没有回答。
只输出数字。
""", question, answer);
try {
return Double.parseDouble(evaluatorLlm.generate(prompt).trim());
} catch (Exception e) {
return 0.5;
}
}
/**
* 完整性:与标准答案相比,信息是否完整
*/
private double evaluateCompleteness(String question, String goldAnswer, String actualAnswer) {
if (goldAnswer == null || goldAnswer.isBlank()) return 1.0; // 没有标准答案就跳过
String prompt = String.format("""
与标准答案相比,判断实际回答的完整性。
问题:%s
标准答案:%s
实际回答:%s
完整性分数(0-1),1表示包含了标准答案的所有关键信息。
只输出数字。
""", question, goldAnswer, actualAnswer);
try {
return Double.parseDouble(evaluatorLlm.generate(prompt).trim());
} catch (Exception e) {
return 0.5;
}
}
private EvaluationReport buildReport(List<CaseResult> results) {
List<CaseResult> successful = results.stream()
.filter(r -> !r.failed())
.toList();
if (successful.isEmpty()) {
return EvaluationReport.allFailed(results.size());
}
double avgFaithfulness = successful.stream()
.mapToDouble(CaseResult::faithfulness).average().orElse(0);
double avgRelevancy = successful.stream()
.mapToDouble(CaseResult::relevancy).average().orElse(0);
double avgCompleteness = successful.stream()
.mapToDouble(CaseResult::completeness).average().orElse(0);
return EvaluationReport.builder()
.totalCases(results.size())
.successfulCases(successful.size())
.avgFaithfulness(avgFaithfulness)
.avgRelevancy(avgRelevancy)
.avgCompleteness(avgCompleteness)
.overallScore((avgFaithfulness + avgRelevancy + avgCompleteness) / 3)
.caseResults(results)
.build();
}
public record EvalCase(
String id,
String question,
List<String> contexts,
String goldAnswer // 可以为null,表示没有标准答案
) {}
public record CaseResult(
String id,
String question,
String actualAnswer,
double faithfulness,
double relevancy,
double completeness
) {
public boolean failed() { return actualAnswer == null; }
public static CaseResult failed(String id, String question) {
return new CaseResult(id, question, null, 0, 0, 0);
}
}
@Data @Builder
public static class EvaluationReport {
private int totalCases;
private int successfulCases;
private double avgFaithfulness;
private double avgRelevancy;
private double avgCompleteness;
private double overallScore;
private List<CaseResult> caseResults;
public static EvaluationReport allFailed(int total) {
return EvaluationReport.builder()
.totalCases(total).successfulCases(0).overallScore(0).build();
}
}
}CI集成:质量门禁
/**
* 在CI流水线中运行评估测试,设置质量门禁
* 如果分数低于阈值,阻断发布
*/
@SpringBootTest
@Tag("evaluation")
class RagQualityGateTest {
@Autowired
private RagQualityEvaluator evaluator;
// 质量基准阈值(根据业务要求设定)
private static final double MIN_FAITHFULNESS = 0.80;
private static final double MIN_RELEVANCY = 0.75;
private static final double MIN_OVERALL = 0.78;
@Test
void shouldMeetQualityThresholds() {
List<RagQualityEvaluator.EvalCase> testCases = loadTestCases();
RagQualityEvaluator.EvaluationReport report = evaluator.evaluate(testCases);
log.info("RAG质量报告: overall={:.2f}, faithfulness={:.2f}, relevancy={:.2f}",
report.getOverallScore(),
report.getAvgFaithfulness(),
report.getAvgRelevancy()
);
// 质量门禁:任一指标低于阈值则失败
assertThat(report.getAvgFaithfulness())
.as("忠实度不达标 (要求>=%.2f,实际=%.2f)",
MIN_FAITHFULNESS, report.getAvgFaithfulness())
.isGreaterThanOrEqualTo(MIN_FAITHFULNESS);
assertThat(report.getAvgRelevancy())
.as("相关性不达标 (要求>=%.2f,实际=%.2f)",
MIN_RELEVANCY, report.getAvgRelevancy())
.isGreaterThanOrEqualTo(MIN_RELEVANCY);
assertThat(report.getOverallScore())
.as("综合分数不达标 (要求>=%.2f,实际=%.2f)",
MIN_OVERALL, report.getOverallScore())
.isGreaterThanOrEqualTo(MIN_OVERALL);
}
private List<RagQualityEvaluator.EvalCase> loadTestCases() {
// 从JSON文件加载测试集
// 实际项目中这个文件由产品/测试团队维护
return List.of(
new RagQualityEvaluator.EvalCase(
"tc001",
"退货申请需要什么条件?",
List.of("7天无理由退换货政策:商品需保持原包装,不影响二次销售"),
"在7天内,保持原包装的情况下可以申请无理由退货"
),
new RagQualityEvaluator.EvalCase(
"tc002",
"发票怎么开?",
List.of("开票流程:登录官网→订单管理→申请发票→填写抬头→提交"),
null // 没有标准答案的用例
)
);
}
}Prompt版本测试
Prompt改变了,回答质量也会变。需要有对比测试机制:
/**
* Prompt版本A/B测试
* 比较两个版本的Prompt在测试集上的表现
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class PromptAbTestService {
private final ChatLanguageModel llm;
private final RagQualityEvaluator evaluator;
/**
* 对比两个Prompt版本的质量
* @return 建议是否升级到新版本
*/
public PromptAbResult comparePrompts(
String promptVersionA,
String promptVersionB,
List<RagQualityEvaluator.EvalCase> testCases) {
// 分别评估两个版本
RagQualityEvaluator.EvaluationReport reportA =
evaluateWithPrompt(promptVersionA, testCases);
RagQualityEvaluator.EvaluationReport reportB =
evaluateWithPrompt(promptVersionB, testCases);
double scoreDiff = reportB.getOverallScore() - reportA.getOverallScore();
boolean bIsSignificantlyBetter = scoreDiff > 0.05; // 超过5%算显著提升
log.info("Prompt A/B测试结果: A={:.3f}, B={:.3f}, diff={:.3f}",
reportA.getOverallScore(), reportB.getOverallScore(), scoreDiff);
return new PromptAbResult(
reportA.getOverallScore(),
reportB.getOverallScore(),
scoreDiff,
bIsSignificantlyBetter ? "建议升级到版本B" : "保持版本A"
);
}
private RagQualityEvaluator.EvaluationReport evaluateWithPrompt(
String systemPrompt,
List<RagQualityEvaluator.EvalCase> testCases) {
List<RagQualityEvaluator.CaseResult> results = testCases.stream()
.map(tc -> {
try {
// 用指定的System Prompt调用LLM
String answer = llm.generate(
SystemMessage.from(systemPrompt),
UserMessage.from(tc.question())
).content().text();
return new RagQualityEvaluator.CaseResult(
tc.id(), tc.question(), answer, 0.8, 0.8, 0.8 // 简化,实际要调用evaluator
);
} catch (Exception e) {
return RagQualityEvaluator.CaseResult.failed(tc.id(), tc.question());
}
})
.toList();
return RagQualityEvaluator.EvaluationReport.builder()
.totalCases(results.size())
.successfulCases((int) results.stream().filter(r -> !r.failed()).count())
.overallScore(results.stream().filter(r -> !r.failed())
.mapToDouble(r -> (r.faithfulness() + r.relevancy()) / 2)
.average().orElse(0))
.caseResults(results)
.build();
}
public record PromptAbResult(
double scoreA,
double scoreB,
double scoreDiff,
String recommendation
) {}
}测试数据管理
/**
* 测试数据集管理
* 测试用例应该版本化,和代码一起纳入Git管理
*/
@Component
@Slf4j
public class TestDatasetManager {
private static final String DATASETS_PATH = "src/test/resources/ai-test-datasets/";
/**
* 从JSON文件加载测试集
* 测试集存储格式:JSON Lines,每行一个测试用例
*/
public List<RagQualityEvaluator.EvalCase> loadDataset(String datasetName) {
String path = DATASETS_PATH + datasetName + ".jsonl";
try {
ObjectMapper mapper = new ObjectMapper();
List<RagQualityEvaluator.EvalCase> cases = new ArrayList<>();
try (BufferedReader reader = new BufferedReader(
new FileReader(path, StandardCharsets.UTF_8))) {
String line;
while ((line = reader.readLine()) != null) {
if (line.isBlank()) continue;
cases.add(mapper.readValue(line, RagQualityEvaluator.EvalCase.class));
}
}
log.info("加载测试集 {}: {} 个用例", datasetName, cases.size());
return cases;
} catch (IOException e) {
throw new RuntimeException("加载测试集失败: " + datasetName, e);
}
}
/**
* 使用LLM自动扩充测试集
* 基于现有文档,生成更多测试用例
*/
public List<RagQualityEvaluator.EvalCase> generateAdditionalCases(
ChatLanguageModel llm,
String documentContent,
int count) {
String prompt = String.format("""
基于以下文档内容,生成%d个不同的问答测试用例。
覆盖不同难度:简单事实查询、需要推理的问题、边界情况。
文档:
%s
输出JSON数组,每个元素包含:
{
"id": "gen_xxx",
"question": "问题",
"goldAnswer": "标准答案"
}
""", count, documentContent.substring(0, Math.min(2000, documentContent.length())));
String response = llm.generate(prompt);
try {
// 解析生成的测试用例
ObjectMapper mapper = new ObjectMapper();
List<Map<String, Object>> rawCases = mapper.readValue(
extractJsonArray(response),
new TypeReference<>() {}
);
return rawCases.stream()
.map(raw -> new RagQualityEvaluator.EvalCase(
(String) raw.get("id"),
(String) raw.get("question"),
List.of(),
(String) raw.get("goldAnswer")
))
.toList();
} catch (Exception e) {
log.warn("解析生成的测试用例失败: {}", e.getMessage());
return List.of();
}
}
private String extractJsonArray(String text) {
int start = text.indexOf('[');
int end = text.lastIndexOf(']');
return start >= 0 && end > start ? text.substring(start, end + 1) : "[]";
}
}测试配置建议
实际项目中,这是我推荐的CI配置:
# .github/workflows/ci.yml 相关配置
test-stages:
unit-tests:
# 每次commit都跑,L1测试
tags: "not integration and not evaluation"
timeout: 5m
integration-tests:
# PR合并时跑,L2测试
tags: "integration"
condition: "on pull_request"
timeout: 15m
env:
TEST_OPENAI_KEY: ${{ secrets.TEST_OPENAI_KEY }}
evaluation-tests:
# 每天定期跑,L3评估测试
tags: "evaluation"
condition: "schedule: daily"
timeout: 60m
failure-action: "notify-slack" # 不阻断,只通知整个测试体系的关键洞察是:Mock测试保稳定性,集成测试保功能,评估测试保质量。三层各司其职,不要试图用一种测试解决所有问题。
