第1718篇:AI功能的A/B测试基础设施——实验平台的设计与统计显著性
有次参加一个产品会议,产品经理说"新Prompt上线后用户满意度明显提升了"。我问他怎么知道是Prompt的功劳,他说:因为上线之后好评变多了。
我当时没说话,因为我知道他的意思是:上个月上线了新Prompt,而且上个月好评率确实涨了。但同期还上线了UI优化,还赶上了双十一促销……
这就是为什么AI功能需要严格的A/B测试,而不是"感觉上去了"。
一、AI A/B测试的特殊性
普通功能的A/B测试已经有很多成熟方案。但AI功能有几个特殊性让这件事更复杂:
特殊性1:评估维度多 转化率、点击率这些业务指标只是一部分。AI特有的还有:回答质量、幻觉率、用户编辑率(用户收到回答后多大程度在修改)、对话轮次(达成目标需要多少轮对话)。
特殊性2:效果延迟 有些AI功能的效果不是立竿见影的。用户今天用AI生成了一份报告,这份报告好不好可能几天后才能从业务反馈里看出来。
特殊性3:交互偏差 Prompt的改动可能影响用户行为,用户行为的改变又反过来影响指标,很难分清因果。
特殊性4:小样本困境 AI功能的调用量可能远低于普通功能,需要更长时间才能积累足够的样本达到统计显著性。
二、实验平台核心架构
三、分流器设计
分流的核心要求:同一个用户在同一个实验里每次都被分到同一个组(粘性)。
@Service
public class ExperimentAssigner {

    private final ExperimentRepository experimentRepo;
    private final UserAssignmentStore assignmentStore;

    // FIX: both fields are final but no constructor existed (and there is no
    // Lombok @RequiredArgsConstructor on this class) — constructor injection.
    public ExperimentAssigner(ExperimentRepository experimentRepo,
                              UserAssignmentStore assignmentStore) {
        this.experimentRepo = experimentRepo;
        this.assignmentStore = assignmentStore;
    }

    /**
     * Assigns the user to a variant of the given experiment.
     * Assignments are sticky: once a user has been bucketed, the stored
     * assignment is always returned so the user never switches groups
     * mid-experiment.
     *
     * @param userId       stable user identifier used as the hash key
     * @param experimentId experiment to assign the user into
     * @return the (possibly pre-existing) assignment; the control assignment
     *         when the experiment is inactive
     * @throws ExperimentNotFoundException when the experiment does not exist
     */
    public ExperimentAssignment assign(String userId, String experimentId) {
        // Check the store first to guarantee stickiness.
        Optional<ExperimentAssignment> existing = assignmentStore.find(userId, experimentId);
        if (existing.isPresent()) {
            return existing.get();
        }
        Experiment experiment = experimentRepo.findById(experimentId)
                .orElseThrow(() -> new ExperimentNotFoundException(experimentId));
        // Inactive experiments silently fall back to the control experience.
        if (!experiment.isActive()) {
            return ExperimentAssignment.control(userId, experimentId);
        }
        // Deterministic hash-based bucketing.
        ExperimentVariant variant = determineVariant(userId, experiment);
        ExperimentAssignment assignment = ExperimentAssignment.builder()
                .userId(userId)
                .experimentId(experimentId)
                .variantId(variant.getId())
                .variantName(variant.getName())
                .assignedAt(Instant.now())
                .build();
        assignmentStore.save(assignment);
        return assignment;
    }

    /**
     * Returns the user's existing assignment, or {@code null} when the user
     * has never been bucketed. FIX: this method is called by
     * AiAnalysisController but was previously missing. Unlike
     * {@link #assign}, it never creates an assignment as a side effect.
     */
    public ExperimentAssignment getExistingAssignment(String userId, String experimentId) {
        return assignmentStore.find(userId, experimentId).orElse(null);
    }

    private ExperimentVariant determineVariant(String userId, Experiment experiment) {
        // Salted hash keeps bucketing independent across experiments.
        String hashKey = userId + ":" + experiment.getId() + ":" + experiment.getSalt();
        int hash = DigestUtils.md5DigestAsHex(hashKey.getBytes()).hashCode();
        // BUG FIX: the old code used Math.abs(hash) % 100, but
        // Math.abs(Integer.MIN_VALUE) is still negative, so the bucket could
        // be negative and would always match the first variant.
        // Math.floorMod is guaranteed to return a value in [0, 100).
        int bucket = Math.floorMod(hash, 100); // 0-99
        // Walk the cumulative traffic split to find the owning variant.
        int cumulative = 0;
        for (ExperimentVariant variant : experiment.getVariants()) {
            cumulative += variant.getTrafficPercentage();
            if (bucket < cumulative) {
                return variant;
            }
        }
        // Traffic percentages not summing to 100 fall through to control.
        return experiment.getControlVariant();
    }

    // Whitelist check for internal testers.
    public boolean isInWhitelist(String userId, String experimentId) {
        return experimentRepo.findById(experimentId)
                .map(exp -> exp.getWhitelist().contains(userId))
                .orElse(false);
    }
}
实验定义的数据结构:
/** Definition of an experiment: variants, traffic split, and metric targets. */
@Data
@Builder
public class Experiment {
    private String id;
    private String name;
    private String description;
    private boolean active;
    private String salt; // random salt so bucketing is independent across experiments
    private List<ExperimentVariant> variants;
    private Set<String> whitelist; // internal testers
    // Measurement targets of the experiment.
    private List<MetricDefinition> primaryMetrics;   // primary metrics (decision basis)
    private List<MetricDefinition> guardrailMetrics; // guardrail metrics (must not regress)
    private Instant startTime;
    private Instant endTime;
    private int minimumSampleSize; // analysis starts only after this is reached

    /**
     * Returns the control variant of this experiment.
     *
     * FIX: callers (ExperimentAssigner, ExperimentMetricsAggregator) invoke
     * getControlVariant(), but Lombok only generates getters for declared
     * fields and there is no controlVariant field — the method did not
     * exist. Resolve it by the conventional variant name "control"
     * (see ExperimentVariant.name).
     *
     * @throws IllegalStateException when no variant is named "control"
     */
    public ExperimentVariant getControlVariant() {
        return variants.stream()
                .filter(v -> "control".equals(v.getName()))
                .findFirst()
                .orElseThrow(() -> new IllegalStateException(
                        "Experiment " + id + " has no control variant"));
    }
}
/** A single arm of an experiment and the traffic share it receives. */
@Data
@Builder
public class ExperimentVariant {
private String id;
private String name; // conventional names: "control", "treatment-a", "treatment-b"
private int trafficPercentage; // traffic share; all variants must sum to 100
private PromptConfig promptConfig; // the prompt configuration this variant serves
}四、事件收集与指标定义
// Event types emitted by AI features; each feeds a metric numerator or denominator.
public enum AiEventType {
AI_RESPONSE_SHOWN, // AI answer displayed to the user (the exposure event)
USER_ACCEPTED_RESPONSE, // user accepted the answer (used it without editing)
USER_EDITED_RESPONSE, // user edited the AI answer
USER_REJECTED_RESPONSE, // user rejected the answer (regenerated or closed it)
USER_THUMBS_UP, // explicit positive feedback
USER_THUMBS_DOWN, // explicit negative feedback
USER_FOLLOW_UP_QUESTION, // user asked a follow-up (the answer did not solve the problem)
TASK_COMPLETED, // user completed the target task
CONVERSATION_ABANDONED // user gave up mid-conversation
}
/** A single experiment event; keyed by user id when published to Kafka. */
@Data
@Builder
public class ExperimentEvent {
private String eventId;
private String userId;
private String sessionId;
private String experimentId;
private String variantId;
private AiEventType eventType;
private Map<String, Object> properties; // free-form extra attributes
private Instant timestamp;
}
// Event collector: publishes experiment events to Kafka, keyed by user id so
// a given user's events land in one partition (preserves per-user ordering).
@Service
public class ExperimentEventCollector {

    private static final String TOPIC = "experiment-events";

    private final KafkaTemplate<String, ExperimentEvent> kafka;

    // FIX: the final field was never initialized (no constructor and no
    // Lombok @RequiredArgsConstructor) — constructor injection.
    public ExperimentEventCollector(KafkaTemplate<String, ExperimentEvent> kafka) {
        this.kafka = kafka;
    }

    /** Publishes a raw event. Fire-and-forget; delivery is left to Kafka's retries. */
    public void collect(ExperimentEvent event) {
        kafka.send(TOPIC, event.getUserId(), event);
    }

    /** Records an exposure: an AI response was shown to the user. */
    public void recordResponseShown(String userId, String sessionId,
                                    String experimentId, String variantId,
                                    String responseId, int responseLength) {
        collect(baseEvent(userId, sessionId, experimentId, variantId,
                AiEventType.AI_RESPONSE_SHOWN)
                .properties(Map.of(
                        "response_id", responseId,
                        "response_length", responseLength
                ))
                .build());
    }

    /** Records that the user accepted the AI response without editing it. */
    public void recordUserAccepted(String userId, String sessionId,
                                   String experimentId, String variantId) {
        collect(baseEvent(userId, sessionId, experimentId, variantId,
                AiEventType.USER_ACCEPTED_RESPONSE).build());
    }

    /** FIX: called by AiAnalysisController but previously missing. */
    public void recordThumbsUp(String userId, String sessionId,
                               String experimentId, String variantId) {
        collect(baseEvent(userId, sessionId, experimentId, variantId,
                AiEventType.USER_THUMBS_UP).build());
    }

    /** FIX: called by AiAnalysisController but previously missing. */
    public void recordUserEdited(String userId, String sessionId,
                                 String experimentId, String variantId) {
        collect(baseEvent(userId, sessionId, experimentId, variantId,
                AiEventType.USER_EDITED_RESPONSE).build());
    }

    // Shared builder: fills the fields common to every event type.
    private ExperimentEvent.ExperimentEventBuilder baseEvent(
            String userId, String sessionId, String experimentId,
            String variantId, AiEventType type) {
        return ExperimentEvent.builder()
                .eventId(UUID.randomUUID().toString())
                .userId(userId)
                .sessionId(sessionId)
                .experimentId(experimentId)
                .variantId(variantId)
                .eventType(type)
                .timestamp(Instant.now());
    }
}
五、统计显著性分析
这是整个A/B测试里最容易出错的部分。我见过很多团队只看"提升了多少百分比",根本不看统计显著性,然后在噪声里做决策。
@Service
public class StatisticalAnalyzer {

    /**
     * Two-sample proportion test (Z-test) for a binary metric
     * (converted / not converted).
     *
     * @param control   total exposures and conversions of the control group
     * @param treatment total exposures and conversions of the treatment group
     * @return the test result, or insufficientData()/noVariation() sentinels
     *         when the test cannot be computed
     */
    public ProportionTestResult testProportions(GroupMetrics control,
                                                GroupMetrics treatment) {
        int n1 = control.getTotalCount();
        int x1 = control.getConvertedCount();
        int n2 = treatment.getTotalCount();
        int x2 = treatment.getConvertedCount();
        if (n1 == 0 || n2 == 0) {
            return ProportionTestResult.insufficientData();
        }
        double p1 = (double) x1 / n1; // control conversion rate
        double p2 = (double) x2 / n2; // treatment conversion rate
        // Pooled rate — the standard error under H0: p1 == p2.
        double p = (double) (x1 + x2) / (n1 + n2);
        double standardError = Math.sqrt(p * (1 - p) * (1.0 / n1 + 1.0 / n2));
        if (standardError == 0) {
            return ProportionTestResult.noVariation();
        }
        double z = (p2 - p1) / standardError;
        // Two-tailed p-value from the normal approximation.
        double pValue = 2 * (1 - normalCDF(Math.abs(z)));
        // 95% margin for the ABSOLUTE difference (unpooled standard error).
        double margin = 1.96 * Math.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2);
        // FIX: guard the p1 == 0 case — the old code divided by zero and
        // reported Infinity as the relative lift. NaN marks it as undefined.
        double relativeDiff = p1 == 0 ? Double.NaN : (p2 - p1) / p1;
        // NOTE: the relative CI below is a first-order approximation obtained
        // by scaling the absolute margin by 1/p1.
        return ProportionTestResult.builder()
                .controlRate(p1)
                .treatmentRate(p2)
                .absoluteDiff(p2 - p1)
                .relativeDiff(relativeDiff)
                .zScore(z)
                .pValue(pValue)
                .isSignificant(pValue < 0.05) // 5% significance level
                .confidenceLevel(0.95)
                .lowerBound(p1 == 0 ? Double.NaN : relativeDiff - margin / p1)
                .upperBound(p1 == 0 ? Double.NaN : relativeDiff + margin / p1)
                .controlSampleSize(n1)
                .treatmentSampleSize(n2)
                .build();
    }

    /**
     * Minimum per-group sample size needed to detect a relative effect of
     * {@code mde} at the given significance level and power.
     *
     * @param baselineRate baseline conversion rate, in (0, 1)
     * @param mde          minimum detectable effect, relative (e.g. 0.05 = +5%)
     * @param alpha        significance level (typically 0.05, two-tailed)
     * @param power        statistical power (typically 0.8)
     */
    public SampleSizeCalculation calculateRequiredSampleSize(
            double baselineRate, double mde, double alpha, double power) {
        // FIX: alpha and power were accepted but ignored — the old code
        // hard-coded z=1.96 / z=0.842 (valid only for alpha=0.05, power=0.8).
        // Derive the critical values from the arguments instead.
        double zAlpha = inverseNormalCDF(1 - alpha / 2);
        double zBeta = inverseNormalCDF(power);
        double p1 = baselineRate;
        double p2 = baselineRate * (1 + mde); // expected treatment rate
        double p = (p1 + p2) / 2;
        int n = (int) Math.ceil(
                Math.pow(zAlpha * Math.sqrt(2 * p * (1 - p)) +
                        zBeta * Math.sqrt(p1 * (1 - p1) + p2 * (1 - p2)), 2)
                / Math.pow(p2 - p1, 2)
        );
        return SampleSizeCalculation.builder()
                .perGroupSampleSize(n)
                .totalSampleSize(n * 2)
                .baselineRate(baselineRate)
                .minimumDetectableEffect(mde)
                .expectedTreatmentRate(p2)
                .build();
    }

    // Standard normal CDF via the erf approximation below.
    private double normalCDF(double x) {
        return 0.5 * (1 + erf(x / Math.sqrt(2)));
    }

    // Inverse standard normal CDF by bisection on normalCDF over [-8, 8].
    // 100 halvings give precision far below the ~1.2e-7 error of the erf
    // approximation, so overall accuracy is limited by erf, not bisection.
    private double inverseNormalCDF(double prob) {
        double lo = -8.0;
        double hi = 8.0;
        for (int i = 0; i < 100; i++) {
            double mid = (lo + hi) / 2;
            if (normalCDF(mid) < prob) {
                lo = mid;
            } else {
                hi = mid;
            }
        }
        return (lo + hi) / 2;
    }

    // erf approximation in Horner form (Numerical Recipes style),
    // max absolute error about 1.2e-7.
    private double erf(double x) {
        double t = 1.0 / (1.0 + 0.5 * Math.abs(x));
        double tau = t * Math.exp(-x * x - 1.26551223 +
                t * (1.00002368 + t * (0.37409196 + t * (0.09678418 +
                t * (-0.18628806 + t * (0.27886807 + t * (-1.13520398 +
                t * (1.48851587 + t * (-0.82215223 + t * 0.17087294)))))))));
        return x >= 0 ? 1 - tau : tau - 1;
    }
}
六、指标聚合与报告生成
@Service
public class ExperimentMetricsAggregator {

    private final JdbcTemplate jdbcTemplate;
    private final StatisticalAnalyzer analyzer;
    // FIX: generateReport referenced experimentRepo, but the field was never
    // declared — declare and inject it.
    private final ExperimentRepository experimentRepo;

    public ExperimentMetricsAggregator(JdbcTemplate jdbcTemplate,
                                       StatisticalAnalyzer analyzer,
                                       ExperimentRepository experimentRepo) {
        this.jdbcTemplate = jdbcTemplate;
        this.analyzer = analyzer;
        this.experimentRepo = experimentRepo;
    }

    /**
     * Builds the full report for an experiment: aggregates per-variant
     * metrics, tests every treatment variant against control on primary and
     * guardrail metrics, and flags guardrail violations.
     *
     * NOTE(review): extractMetric, generateRecommendation and
     * determineOverallRecommendation are referenced but not shown here —
     * assumed to be defined elsewhere in this class; confirm.
     */
    public ExperimentReport generateReport(String experimentId) {
        Experiment experiment = experimentRepo.findById(experimentId)
                .orElseThrow();
        // Aggregate raw counts for every variant, control included.
        Map<String, GroupMetrics> groupMetrics = new HashMap<>();
        for (ExperimentVariant variant : experiment.getVariants()) {
            groupMetrics.put(variant.getId(), aggregateMetrics(experimentId, variant.getId()));
        }
        ExperimentVariant control = experiment.getControlVariant();
        GroupMetrics controlMetrics = groupMetrics.get(control.getId());
        // Statistical tests: each treatment variant vs control.
        List<VariantAnalysis> analyses = new ArrayList<>();
        for (ExperimentVariant variant : experiment.getVariants()) {
            if (variant.getId().equals(control.getId())) continue;
            GroupMetrics treatmentMetrics = groupMetrics.get(variant.getId());
            // Primary metrics — the decision basis.
            Map<String, ProportionTestResult> primaryResults = new HashMap<>();
            for (MetricDefinition metric : experiment.getPrimaryMetrics()) {
                GroupMetrics controlForMetric = extractMetric(controlMetrics, metric);
                GroupMetrics treatmentForMetric = extractMetric(treatmentMetrics, metric);
                primaryResults.put(metric.getName(),
                        analyzer.testProportions(controlForMetric, treatmentForMetric));
            }
            // Guardrail metrics — must not regress.
            Map<String, ProportionTestResult> guardrailResults = new HashMap<>();
            for (MetricDefinition metric : experiment.getGuardrailMetrics()) {
                GroupMetrics controlForMetric = extractMetric(controlMetrics, metric);
                GroupMetrics treatmentForMetric = extractMetric(treatmentMetrics, metric);
                ProportionTestResult guardrailTest =
                        analyzer.testProportions(controlForMetric, treatmentForMetric);
                guardrailResults.put(metric.getName(), guardrailTest);
            }
            // A guardrail fires on a statistically significant relative drop
            // of more than 5%.
            boolean guardrailViolated = guardrailResults.values().stream()
                    .anyMatch(r -> r.isSignificant() && r.getRelativeDiff() < -0.05);
            analyses.add(VariantAnalysis.builder()
                    .variantId(variant.getId())
                    .variantName(variant.getName())
                    .sampleSize(treatmentMetrics.getTotalCount())
                    .primaryMetricResults(primaryResults)
                    .guardrailResults(guardrailResults)
                    .guardrailViolated(guardrailViolated)
                    .recommendation(generateRecommendation(primaryResults, guardrailViolated))
                    .build());
        }
        return ExperimentReport.builder()
                .experimentId(experimentId)
                .experimentName(experiment.getName())
                .generatedAt(Instant.now())
                .controlSampleSize(controlMetrics.getTotalCount())
                .analyses(analyses)
                .overallRecommendation(determineOverallRecommendation(analyses))
                .build();
    }

    // Aggregates raw event counts for one variant from the event table.
    private GroupMetrics aggregateMetrics(String experimentId, String variantId) {
        String sql = """
            SELECT
            COUNT(*) as total_shown,
            SUM(CASE WHEN event_type = 'USER_ACCEPTED_RESPONSE' THEN 1 ELSE 0 END) as accepted,
            SUM(CASE WHEN event_type = 'USER_EDITED_RESPONSE' THEN 1 ELSE 0 END) as edited,
            SUM(CASE WHEN event_type = 'USER_REJECTED_RESPONSE' THEN 1 ELSE 0 END) as rejected,
            SUM(CASE WHEN event_type = 'USER_THUMBS_UP' THEN 1 ELSE 0 END) as thumbs_up,
            SUM(CASE WHEN event_type = 'USER_THUMBS_DOWN' THEN 1 ELSE 0 END) as thumbs_down,
            SUM(CASE WHEN event_type = 'TASK_COMPLETED' THEN 1 ELSE 0 END) as task_completed
            FROM experiment_events
            WHERE experiment_id = ?
            AND variant_id = ?
            AND timestamp >= ?
            """;
        return jdbcTemplate.queryForObject(sql,
                (rs, rowNum) -> GroupMetrics.builder()
                        .totalCount(rs.getInt("total_shown"))
                        .acceptedCount(rs.getInt("accepted"))
                        .editedCount(rs.getInt("edited"))
                        .rejectedCount(rs.getInt("rejected"))
                        .thumbsUpCount(rs.getInt("thumbs_up"))
                        .thumbsDownCount(rs.getInt("thumbs_down"))
                        .taskCompletedCount(rs.getInt("task_completed"))
                        .build(),
                experimentId, variantId, experimentStartTime(experimentId)
        );
    }

    // FIX: referenced by aggregateMetrics but previously missing — events are
    // windowed from the experiment's configured start time.
    private Instant experimentStartTime(String experimentId) {
        return experimentRepo.findById(experimentId)
                .map(Experiment::getStartTime)
                .orElse(Instant.EPOCH);
    }
}
七、常见错误与防坑指南
错误1:多重检验问题(P-hacking)
当你同时测试10个指标时,即使实际上没有效果,按5%显著性水平,平均有0.5个指标会"显著"。
解决方案:使用Bonferroni校正或Benjamini-Hochberg校正:
// Bonferroni correction: divide the significance level by the number of tests.
public double bonferroniThreshold(int numberOfTests, double alpha) {
return alpha / numberOfTests;
}
// Usage:
double adjustedAlpha = bonferroniThreshold(primaryMetrics.size(), 0.05);
boolean significant = pValue < adjustedAlpha;
错误2:过早停止实验(Peeking Problem)
很多人习惯每天看实验数据,一看到显著差异就停止实验。但这会大幅增加误报率。
解决方案:事先计算所需样本量,等达到后才做决策;或者使用Sequential Testing方法(如O'Brien-Fleming边界)。
// Peeking protection: results may only be inspected once the pre-computed
// required sample size has been reached in every group.
public boolean canAnalyze(ExperimentProgress progress) {
return progress.getMinGroupSize() >= progress.getRequiredSampleSize();
}错误3:辛普森悖论
如果实验组和对照组的用户构成不同(比如实验组更多是高频用户),整体指标可能被误导。
解决方案:按用户段分层分析,确保各组的用户构成可比。
错误4:护栏指标不够
只关注正向指标(提升),忽视了可能被损害的指标(比如响应质量提升了,但延迟也增加了)。一定要定义护栏指标,任何护栏指标的显著恶化都应该阻止发布。
八、与Spring Boot集成
@RestController
@RequestMapping("/api/v1/ai")
public class AiAnalysisController {

    // Previously inlined at the call site; name the experiment id.
    private static final String PROMPT_EXPERIMENT_ID = "prompt-experiment-v3";

    private final SentimentAnalysisService sentimentService;
    private final ExperimentAssigner assigner;
    private final ExperimentEventCollector eventCollector;

    // FIX: replaced field @Autowired with constructor injection — the fields
    // can be final and the controller is testable without a Spring context.
    public AiAnalysisController(SentimentAnalysisService sentimentService,
                                ExperimentAssigner assigner,
                                ExperimentEventCollector eventCollector) {
        this.sentimentService = sentimentService;
        this.assigner = assigner;
        this.eventCollector = eventCollector;
    }

    /**
     * Runs the AI analysis for a user: buckets them into the prompt
     * experiment, executes the analysis with the variant's prompt config,
     * and records the exposure event.
     *
     * NOTE(review): getPromptConfig is referenced but not shown here —
     * assumed to be defined elsewhere in this class; confirm.
     */
    @PostMapping("/analyze")
    public ResponseEntity<AnalysisResponse> analyze(
            @RequestBody AnalysisRequest request,
            @RequestHeader("X-User-Id") String userId) {
        // Bucketing: decide which prompt variant this user gets.
        ExperimentAssignment assignment = assigner.assign(userId, PROMPT_EXPERIMENT_ID);
        PromptConfig promptConfig = getPromptConfig(assignment.getVariantId());
        long start = System.currentTimeMillis();
        AnalysisResponse response = sentimentService.analyzeWithConfig(
                request, promptConfig
        );
        long latency = System.currentTimeMillis() - start;
        // TODO(review): latency is measured but never reported anywhere —
        // either attach it to the exposure event or drop the measurement.
        // Record the exposure event.
        eventCollector.recordResponseShown(
                userId, request.getSessionId(),
                assignment.getExperimentId(),
                assignment.getVariantId(),
                response.getRequestId(),
                response.getContent().length()
        );
        // Echo experiment info in response headers for debugging.
        return ResponseEntity.ok()
                .header("X-Experiment-Id", assignment.getExperimentId())
                .header("X-Variant-Id", assignment.getVariantId())
                .body(response);
    }

    /**
     * Records user feedback events. Only users who already have an
     * assignment are recorded — feedback must never create one.
     */
    @PostMapping("/feedback")
    public ResponseEntity<Void> feedback(
            @RequestBody FeedbackRequest feedback,
            @RequestHeader("X-User-Id") String userId) {
        ExperimentAssignment assignment = assigner.getExistingAssignment(
                userId, feedback.getExperimentId()
        );
        if (assignment != null) {
            if (feedback.isThumbsUp()) {
                eventCollector.recordThumbsUp(userId, feedback.getSessionId(),
                        assignment.getExperimentId(), assignment.getVariantId());
            } else if (feedback.isEdited()) {
                eventCollector.recordUserEdited(userId, feedback.getSessionId(),
                        assignment.getExperimentId(), assignment.getVariantId());
            }
            // NOTE(review): thumbs-down / rejection feedback is not recorded
            // here even though AiEventType defines it — confirm intent.
        }
        return ResponseEntity.ok().build();
    }
}
总结
AI功能的A/B测试需要比普通功能更严格的态度:
- 事先计算样本量,不要边看边决策
- 定义好主要指标和护栏指标,不能只看好的一面
- 做统计显著性检验,不要只看绝对数字
- 保持实验足够长的时间,避免新奇效应的干扰
- 分层分析,确认效果在不同用户群体里都成立
"感觉提升了"在产品讨论会上是可以接受的,但作为工程师,你需要的是能用数据说话的答案。
