第2118篇:AI数据飞轮——用用户反馈持续改善RAG系统质量
大约 9 分钟
第2118篇:AI数据飞轮——用用户反馈持续改善RAG系统质量
适读人群:运营中的RAG/AI助手系统工程师 | 阅读时长:约18分钟 | 核心价值:建立从用户反馈到系统改善的闭环,实现AI系统质量的持续提升
一个没有反馈闭环的AI系统是静止的。用户不满意,工程师不知道,系统不改变。
数据飞轮的思想是:每次用户与AI交互都产生有价值的数据,这些数据驱动系统改善,改善后系统能更好地服务用户,产生更多有价值的数据,形成正向循环。
对于RAG系统,这个飞轮很具体:
- 用户点了"没帮助"?→ 分析是检索问题还是生成问题 → 修复 → 验证修复效果
- 用户追问了同样的问题?→ 说明上次回答不完整 → 补充知识库
- 某类问题频繁被报告?→ 专项改善这个类别
听起来简单,但把这套流程系统化工程化,需要认真设计。
反馈信号的分类
/**
* 用户反馈信号分类
*
* ===== 显式反馈(用户主动表达)=====
*
* 强信号:
* - 点赞/点踩(有帮助/没帮助)
* - 文字反馈(填写具体原因)
* - 举报(有害内容、错误信息)
*
* 数量:少,但质量高
*
* ===== 隐式反馈(行为信号)=====
*
* 正向信号:
* - 复制了AI的回答内容
* - 对话继续且变长(说明AI在帮忙)
* - 相同主题继续深入(话题被展开)
*
* 负向信号:
* - 立即重复问了同样的问题(说明上次没答好)
* - 短时间内结束会话(可能直接放弃了)
* - 用户自己给出了答案(AI没做到)
*
* 数量:多,但信号弱且有噪声
*
* ===== 系统信号(内部可观测)=====
*
* - 检索到的文档和答案的相关性(可以自动评估)
* - 空检索率(没找到任何文档)
* - 超时率、错误率
* - Hallucination检测分数
*/
反馈收集与存储
/**
* 用户反馈收集服务
*
* 连接用户行为和系统改善
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class FeedbackCollectionService {
private final JdbcTemplate jdbc;
private final FeedbackAnalysisQueue analysisQueue;
/**
 * Persists one piece of explicit user feedback in {@code user_feedback} and,
 * when the feedback is negative, enqueues the interaction for analysis.
 *
 * @param feedback the feedback to store; its retrieved document ids may be
 *                 {@code null}, in which case an empty string is stored
 */
public void collectExplicitFeedback(ExplicitFeedback feedback) {
    // The document ids are flattened into a single comma-separated column.
    List<String> docIds = feedback.getRetrievedDocIds();
    if (docIds == null) {
        docIds = List.of();
    }
    final String docIdCsv = String.join(",", docIds);
    final String insertSql = """
            INSERT INTO user_feedback
            (feedback_id, user_id, session_id, interaction_id,
            feedback_type, is_positive, reason_category, reason_text,
            query_text, response_text, retrieved_doc_ids, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW())
            """;

    jdbc.update(insertSql,
            UUID.randomUUID().toString(),
            feedback.getUserId(),
            feedback.getSessionId(),
            feedback.getInteractionId(),
            "EXPLICIT",
            feedback.isPositive(),
            feedback.getReasonCategory(),
            feedback.getReasonText(),
            feedback.getQueryText(),
            feedback.getResponseText(),
            docIdCsv);

    log.info("显式反馈收集: interactionId={}, positive={}, category={}",
            feedback.getInteractionId(), feedback.isPositive(), feedback.getReasonCategory());

    // Positive feedback needs no follow-up; negative feedback goes straight to analysis.
    if (feedback.isPositive()) {
        return;
    }
    analysisQueue.enqueue(feedback.getInteractionId());
}
/**
 * Records an implicit (behavioural) feedback signal for an interaction.
 *
 * <p>The row is upserted on {@code (interaction_id, signal_type)}: a later
 * signal of the same type for the same interaction overwrites the stored value.
 * Recording is best-effort: any failure is logged and never propagated to the caller.
 *
 * @param interactionId id of the interaction the signal belongs to
 * @param signal        the observed signal (type and value)
 */
public void collectImplicitFeedback(String interactionId, ImplicitSignal signal) {
    try {
        jdbc.update("""
                INSERT INTO implicit_signals (interaction_id, signal_type, signal_value, created_at)
                VALUES (?, ?, ?, NOW())
                ON CONFLICT (interaction_id, signal_type) DO UPDATE
                SET signal_value = EXCLUDED.signal_value
                """,
                interactionId, signal.getType().name(), signal.getValue()
        );
    } catch (Exception e) {
        // Deliberately swallowed (implicit signals must not break the user flow),
        // but the throwable is passed as the last argument so the stack trace is kept.
        log.warn("隐式反馈记录失败: {}", e.getMessage(), e);
    }
}
/**
 * Loads negative feedback rows whose analysis is still pending, oldest first,
 * for batch processing.
 *
 * @param limit maximum number of rows to return
 * @return the pending negative feedback records
 */
public List<FeedbackRecord> getUnanalyzedNegativeFeedback(int limit) {
    // NOTE(review): f.* presumably already carries retrieved_doc_ids (the explicit-feedback
    // insert writes that column), so the label is selected twice here; confirm which of the
    // two columns the driver returns for getString("retrieved_doc_ids").
    final String pendingNegativeSql = """
            SELECT f.*, i.retrieved_doc_ids, i.reranked_scores
            FROM user_feedback f
            LEFT JOIN interaction_details i ON i.interaction_id = f.interaction_id
            WHERE f.is_positive = false
            AND f.analysis_status = 'PENDING'
            ORDER BY f.created_at
            LIMIT ?
            """;

    return jdbc.query(pendingNegativeSql, (row, index) -> {
        return FeedbackRecord.builder()
                .feedbackId(row.getString("feedback_id"))
                .interactionId(row.getString("interaction_id"))
                .queryText(row.getString("query_text"))
                .responseText(row.getString("response_text"))
                .reasonCategory(row.getString("reason_category"))
                .reasonText(row.getString("reason_text"))
                .retrievedDocIds(parseDocIds(row.getString("retrieved_doc_ids")))
                .build();
    }, limit);
}
/**
 * Splits a comma-separated id column back into a list.
 *
 * @param csv the stored value; may be {@code null} or empty
 * @return an unmodifiable list of ids, empty when there is nothing to split
 */
private List<String> parseDocIds(String csv) {
    boolean nothingStored = (csv == null) || csv.isEmpty();
    if (nothingStored) {
        return List.of();
    }
    String[] parts = csv.split(",");
    return List.of(parts);
}
/**
 * Input payload for explicit feedback submitted by a user.
 * Accessors and the builder are generated by Lombok.
 */
@Data
@Builder
public static class ExplicitFeedback {
private String userId;
private String sessionId;
private String interactionId;
// true for a positive rating; negative feedback is additionally queued for analysis
private boolean positive;
private String reasonCategory; // WRONG_INFO/NOT_HELPFUL/IRRELEVANT/INCOMPLETE/OTHER
private String reasonText;
private String queryText;
private String responseText;
// may be null; stored as a comma-separated string
private List<String> retrievedDocIds;
}
/**
 * An implicit feedback signal: a signal type plus a free-form string value.
 * Accessors and the builder are generated by Lombok.
 */
@Data
@Builder
public static class ImplicitSignal {
private ImplicitSignalType type;
private String value;
/** Kinds of implicit signal; the constant name is what gets written to {@code signal_type}. */
public enum ImplicitSignalType {
CONTENT_COPIED, REPEATED_QUERY, SESSION_ABANDONED_FAST,
FOLLOWUP_DEEPENED, USER_PROVIDED_ANSWER
}
}
/**
 * A stored feedback row as read back for analysis.
 * Accessors and the builder are generated by Lombok.
 */
@Data
@Builder
public static class FeedbackRecord {
private String feedbackId;
private String interactionId;
private String queryText;
private String responseText;
private String reasonCategory;
private String reasonText;
// parsed from the comma-separated retrieved_doc_ids column; empty when none were stored
private List<String> retrievedDocIds;
}
}
自动原因分析
/**
* 负面反馈根因分析
*
* 自动判断失败是"检索问题"还是"生成问题"
* 指导后续的修复工作
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class FeedbackRootCauseAnalyzer {
private final ChatLanguageModel llm;
private final VectorStore vectorStore;
private final EmbeddingModel embeddingModel;
private final JdbcTemplate jdbc;
/**
 * Determines the root cause of one negative feedback record and persists the result.
 *
 * <p>Retrieval quality is assessed first, then generation quality; both feed the
 * diagnosis, from which an improvement suggestion is derived. The feedback row is
 * then updated with the root cause and suggestion and marked {@code COMPLETED}.
 *
 * @param feedback the negative feedback record to analyse
 * @return the diagnosis together with the two quality assessments and the suggestion
 */
public RootCauseAnalysis analyze(FeedbackCollectionService.FeedbackRecord feedback) {
    RetrievalQuality retrieval = assessRetrievalQuality(feedback);
    GenerationQuality generation = assessGenerationQuality(feedback);
    RootCause cause = diagnose(retrieval, generation);
    String suggestion = generateImprovementAction(cause, feedback);

    // Persist the outcome so the row is no longer picked up as pending.
    final String markCompletedSql = """
            UPDATE user_feedback
            SET root_cause = ?, improvement_action = ?, analysis_status = 'COMPLETED'
            WHERE feedback_id = ?
            """;
    jdbc.update(markCompletedSql, cause.name(), suggestion, feedback.getFeedbackId());

    return new RootCauseAnalysis(cause, retrieval, generation, suggestion);
}
/**
 * Asks the LLM whether the retrieved documents could answer the user's question.
 *
 * <p>Only {@code relevanceScore} is read from the reply: a score of 0.7 or more maps to
 * {@code GOOD}, 0.4 or more to {@code PARTIALLY_RELEVANT}, anything lower to
 * {@code IRRELEVANT}. A reply without a score is treated as 0.5.
 *
 * @param feedback the feedback record under analysis
 * @return {@code NO_DOCUMENTS_FOUND} when the record has no document ids,
 *         {@code UNKNOWN} when the LLM call or the parsing fails,
 *         otherwise the bucket derived from the score
 */
private RetrievalQuality assessRetrievalQuality(FeedbackCollectionService.FeedbackRecord feedback) {
    if (feedback.getRetrievedDocIds() == null || feedback.getRetrievedDocIds().isEmpty()) {
        return RetrievalQuality.NO_DOCUMENTS_FOUND;
    }
    // Let the LLM judge whether the retrieved documents are actually relevant.
    String prompt = """
            用户问题:%s
            检索到的文档:%s
            请判断这些文档是否真的包含回答用户问题所需的信息?
            返回JSON:
            {
            "isRelevant": true/false,
            "relevanceScore": 0-1,
            "reason": "判断理由"
            }
            只返回JSON。
            """.formatted(feedback.getQueryText(), getDocumentsContent(feedback.getRetrievedDocIds()));
    try {
        String response = llm.generate(prompt);
        String json = extractJson(response);
        ObjectMapper mapper = new ObjectMapper();
        JsonNode root = mapper.readTree(json);
        double score = root.path("relevanceScore").asDouble(0.5);
        if (score >= 0.7) {
            return RetrievalQuality.GOOD;
        }
        if (score >= 0.4) {
            return RetrievalQuality.PARTIALLY_RELEVANT;
        }
        return RetrievalQuality.IRRELEVANT;
    } catch (Exception e) {
        // Still degrades to UNKNOWN, but the failure is now visible in the logs
        // instead of being silently discarded.
        log.warn("检索质量评估失败: feedbackId={}", feedback.getFeedbackId(), e);
        return RetrievalQuality.UNKNOWN;
    }
}
/**
 * Asks the LLM to grade the answer that was given to the user.
 *
 * <p>A reported hallucination takes precedence over the quality level. Otherwise
 * {@code "GOOD"} and {@code "POOR"} map to their enum constants and every other
 * (or missing) level is treated as {@code ACCEPTABLE}.
 *
 * @param feedback the feedback record under analysis
 * @return the graded quality, or {@code UNKNOWN} when the LLM call or the parsing fails
 */
private GenerationQuality assessGenerationQuality(FeedbackCollectionService.FeedbackRecord feedback) {
    String prompt = """
            用户问题:%s
            AI的回答:%s
            请评估回答质量:
            1. 回答是否准确(基于一般知识判断)
            2. 回答是否完整
            3. 是否存在明显的捏造或错误
            返回JSON:
            {
            "qualityLevel": "GOOD/ACCEPTABLE/POOR",
            "issues": ["问题1", "问题2"],
            "hasHallucination": true/false
            }
            只返回JSON。
            """.formatted(feedback.getQueryText(), feedback.getResponseText());
    try {
        String response = llm.generate(prompt);
        String json = extractJson(response);
        ObjectMapper mapper = new ObjectMapper();
        JsonNode root = mapper.readTree(json);
        String level = root.path("qualityLevel").asText("ACCEPTABLE");
        boolean hallucination = root.path("hasHallucination").asBoolean(false);
        if (hallucination) {
            return GenerationQuality.HALLUCINATION;
        }
        return switch (level) {
            case "GOOD" -> GenerationQuality.GOOD;
            case "POOR" -> GenerationQuality.POOR;
            default -> GenerationQuality.ACCEPTABLE;
        };
    } catch (Exception e) {
        // Still degrades to UNKNOWN, but the failure is now visible in the logs
        // instead of being silently discarded.
        log.warn("生成质量评估失败: feedbackId={}", feedback.getFeedbackId(), e);
        return GenerationQuality.UNKNOWN;
    }
}
/**
 * Combines the two assessments into a single root cause.
 *
 * <p>The checks are ordered by precedence: missing documents, irrelevant documents,
 * hallucination, poor generation, partially relevant documents. Anything else is
 * {@code UNKNOWN}.
 *
 * @param retrieval  the retrieval assessment
 * @param generation the generation assessment
 * @return the first root cause that applies
 */
private RootCause diagnose(RetrievalQuality retrieval, GenerationQuality generation) {
    final RootCause cause;
    if (retrieval == RetrievalQuality.NO_DOCUMENTS_FOUND) {
        // Nothing relevant exists in the knowledge base.
        cause = RootCause.KNOWLEDGE_GAP;
    } else if (retrieval == RetrievalQuality.IRRELEVANT) {
        // Documents were retrieved, but they do not match the question.
        cause = RootCause.RETRIEVAL_FAILURE;
    } else if (generation == GenerationQuality.HALLUCINATION) {
        cause = RootCause.HALLUCINATION;
    } else if (generation == GenerationQuality.POOR) {
        cause = RootCause.GENERATION_QUALITY;
    } else if (retrieval == RetrievalQuality.PARTIALLY_RELEVANT) {
        // Some relevant material was retrieved, but not all of it.
        cause = RootCause.RETRIEVAL_INCOMPLETE;
    } else {
        cause = RootCause.UNKNOWN;
    }
    return cause;
}
/**
 * Produces the human-readable improvement suggestion for a diagnosed root cause.
 *
 * @param rootCause the diagnosed root cause
 * @param feedback  the feedback record; its query text is embedded in the
 *                  knowledge-gap and retrieval-failure suggestions
 * @return the suggestion text; causes without a dedicated suggestion ask for manual review
 */
private String generateImprovementAction(
        RootCause rootCause, FeedbackCollectionService.FeedbackRecord feedback) {
    switch (rootCause) {
        case KNOWLEDGE_GAP:
            return "建议添加到知识库:关于'" + feedback.getQueryText() + "'的相关内容";
        case RETRIEVAL_FAILURE:
            return "检索优化:为查询'" + feedback.getQueryText() + "'添加同义词映射或关键词标签";
        case HALLUCINATION:
            return "生成优化:对此类问题加强Prompt约束,要求严格基于检索内容回答";
        case GENERATION_QUALITY:
            return "生成优化:为此类问题类型添加few-shot示例或改进Prompt";
        case RETRIEVAL_INCOMPLETE:
            return "检索优化:扩大topK或调整检索范围,确保覆盖全面";
        default:
            return "需要人工审查此反馈";
    }
}
/**
 * Builds the document section of the relevance prompt.
 *
 * <p>Simplified implementation: at most the first three entries are used, and the
 * ids themselves are joined rather than the documents' content.
 *
 * @param docIds ids of the retrieved documents
 * @return the first three ids separated by {@code "\n---\n"}
 */
private String getDocumentsContent(List<String> docIds) {
    // Only the first three documents are considered.
    int considered = Math.min(3, docIds.size());
    List<String> head = docIds.subList(0, considered);
    return String.join("\n---\n", head);
}
/**
 * Cuts the outermost {@code {...}} span out of an LLM reply.
 *
 * @param s the raw reply
 * @return the text from the first '{' to the last '}', or {@code s} unchanged
 *         when no such span exists
 */
private String extractJson(String s) {
    final int open = s.indexOf('{');
    final int close = s.lastIndexOf('}');
    if (open < 0 || close <= open) {
        return s;
    }
    return s.substring(open, close + 1);
}
/** Outcome of the retrieval assessment; UNKNOWN is returned when the assessment itself fails. */
public enum RetrievalQuality { GOOD, PARTIALLY_RELEVANT, IRRELEVANT, NO_DOCUMENTS_FOUND, UNKNOWN }
/** Outcome of the generation assessment; UNKNOWN is returned when the assessment itself fails. */
public enum GenerationQuality { GOOD, ACCEPTABLE, POOR, HALLUCINATION, UNKNOWN }
/** Diagnosed root cause of a negative feedback; the constant name is stored in {@code root_cause}. */
public enum RootCause {
KNOWLEDGE_GAP, // the knowledge base lacks the content
RETRIEVAL_FAILURE, // retrieval returned documents that are not relevant
RETRIEVAL_INCOMPLETE, // retrieval returned only part of what was needed
HALLUCINATION, // the model fabricated content
GENERATION_QUALITY, // the generated answer is of poor quality
UNKNOWN
}
/**
 * Result of analysing one negative feedback: the diagnosed root cause, the two
 * assessments it was derived from, and the suggested improvement text.
 */
record RootCauseAnalysis(
RootCause rootCause,
RetrievalQuality retrievalQuality,
GenerationQuality generationQuality,
String improvementAction
) {}
}
知识库自动补充
/**
* 知识库差距识别和补充服务
*
* 当反馈分析发现KNOWLEDGE_GAP时,
* 自动分析需要补充什么内容
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class KnowledgeGapService {
private final JdbcTemplate jdbc;
private final ChatLanguageModel llm;
/**
 * Identifies knowledge-base gaps from recent feedback.
 *
 * <p>Queries that were diagnosed as {@code KNOWLEDGE_GAP} at least twice within the
 * window are loaded (top 50 by frequency) and handed to the LLM, which groups them
 * into topics.
 *
 * @param days size of the look-back window in days
 * @return the gaps reported by the LLM; empty when there is no qualifying feedback
 *         or when the LLM call fails
 */
public List<KnowledgeGap> analyzeGaps(int days) {
    // The window is passed as a bind parameter rather than formatted into the SQL text.
    List<Map<String, Object>> rawGaps = jdbc.queryForList("""
            SELECT query_text, COUNT(*) as frequency
            FROM user_feedback
            WHERE root_cause = 'KNOWLEDGE_GAP'
            AND created_at > NOW() - (? * INTERVAL '1 day')
            GROUP BY query_text
            HAVING COUNT(*) >= 2
            ORDER BY frequency DESC
            LIMIT 50
            """, days);
    if (rawGaps.isEmpty()) {
        return List.of();
    }
    // GROUP BY can yield a NULL query_text group; skip it instead of failing on toString().
    List<String> queries = rawGaps.stream()
            .map(r -> r.get("query_text"))
            .filter(q -> q != null)
            .map(Object::toString)
            .toList();
    if (queries.isEmpty()) {
        return List.of();
    }
    // Let the LLM cluster similar queries into the actual knowledge gaps.
    // NOTE(review): the per-query frequencies are not passed to the LLM, so the
    // "frequency" it reports is presumably an estimate — confirm whether that is intended.
    String clusterPrompt = """
            以下是用户提问但知识库没有答案的查询列表:
            %s
            请将这些查询按主题聚类,识别出需要补充的知识点。
            返回JSON:
            {
            "gaps": [
            {
            "topic": "知识点主题",
            "representativeQueries": ["代表性查询1", "查询2"],
            "frequency": 出现频次,
            "suggestedContent": "建议补充的内容类型(FAQ/文档/规范等)"
            }
            ]
            }
            只返回JSON。
            """.formatted(String.join("\n", queries));
    try {
        String response = llm.generate(clusterPrompt);
        return parseGaps(response);
    } catch (Exception e) {
        // Pass the throwable as well so the stack trace is not lost.
        log.error("知识缺口分析失败: {}", e.getMessage(), e);
        return List.of();
    }
}
/**
 * Renders the knowledge-gap report (Markdown) for the content team, assuming the
 * gaps were computed over a 30-day window.
 *
 * @param gaps the gaps to list
 * @return the report text
 */
public String generateGapReport(List<KnowledgeGap> gaps) {
    return generateGapReport(gaps, 30);
}

/**
 * Renders the knowledge-gap report (Markdown) for the content team.
 *
 * @param gaps the gaps to list, in the order they should appear
 * @param days the look-back window the gaps were computed over; stated in the
 *             report introduction so it matches the analysis that produced them
 * @return the report text
 */
public String generateGapReport(List<KnowledgeGap> gaps, int days) {
    StringBuilder report = new StringBuilder();
    report.append("# 知识库补充建议\n\n");
    report.append(String.format("基于最近%d天的用户反馈分析,发现以下知识缺口:\n\n", days));
    for (int i = 0; i < gaps.size(); i++) {
        KnowledgeGap gap = gaps.get(i);
        // Sections are numbered from 1.
        report.append(String.format("## %d. %s(频次:%d)\n\n",
                i + 1, gap.topic(), gap.frequency()));
        report.append("典型用户问题:\n");
        gap.representativeQueries().forEach(q ->
                report.append("- ").append(q).append("\n"));
        report.append("\n建议内容类型:").append(gap.suggestedContent()).append("\n\n");
    }
    return report.toString();
}
/**
 * Parses the LLM's clustering reply into {@code KnowledgeGap} records.
 *
 * <p>Reads the {@code gaps} array; a missing {@code frequency} defaults to 1.
 *
 * @param response the raw LLM reply
 * @return the parsed gaps, or an empty list when the reply cannot be parsed
 */
private List<KnowledgeGap> parseGaps(String response) {
    try {
        String json = extractJson(response);
        ObjectMapper mapper = new ObjectMapper();
        JsonNode root = mapper.readTree(json);
        List<KnowledgeGap> gaps = new ArrayList<>();
        for (JsonNode gap : root.path("gaps")) {
            List<String> queries = new ArrayList<>();
            for (JsonNode q : gap.path("representativeQueries")) {
                queries.add(q.asText());
            }
            gaps.add(new KnowledgeGap(
                    gap.path("topic").asText(),
                    queries,
                    gap.path("frequency").asInt(1),
                    gap.path("suggestedContent").asText()
            ));
        }
        return gaps;
    } catch (Exception e) {
        // Still degrades to "no gaps", but an unparseable reply is now logged;
        // otherwise it is indistinguishable from a genuinely empty result.
        log.warn("知识缺口解析失败: {}", e.getMessage(), e);
        return List.of();
    }
}
/**
 * Cuts the outermost {@code {...}} span out of an LLM reply.
 *
 * @param s the raw reply
 * @return the text from the first '{' to the last '}', or {@code s} unchanged
 *         when no such span exists
 */
private String extractJson(String s) {
    final int firstBrace = s.indexOf('{');
    final int lastBrace = s.lastIndexOf('}');
    boolean hasObject = firstBrace >= 0 && lastBrace > firstBrace;
    if (!hasObject) {
        return s;
    }
    return s.substring(firstBrace, lastBrace + 1);
}
/**
 * One knowledge gap as reported by the LLM clustering step: the topic, example user
 * queries, the reported frequency, and the suggested type of content to add.
 */
record KnowledgeGap(String topic, List<String> representativeQueries,
int frequency, String suggestedContent) {}
}
飞轮效果追踪
/**
* 系统改善效果追踪
*
* 每次做了改善(补充知识库、调整Prompt)之后
* 需要验证改善是否真的有效
*/
@Service
@RequiredArgsConstructor
@Slf4j
public class ImprovementTrackingService {
private final JdbcTemplate jdbc;
/**
 * Records one improvement action in {@code improvement_log} under a freshly
 * generated action id.
 *
 * @param action the action to record; its affected query patterns may be
 *               {@code null}, in which case an empty string is stored
 */
public void recordImprovement(ImprovementAction action) {
    // String.join throws on a null list; store an empty value instead.
    List<String> patterns = action.getAffectedQueryPatterns();
    String patternCsv = patterns == null ? "" : String.join(",", patterns);
    jdbc.update("""
            INSERT INTO improvement_log
            (action_id, action_type, description, root_cause_addressed,
            affected_query_patterns, created_at)
            VALUES (?, ?, ?, ?, ?, NOW())
            """,
            UUID.randomUUID().toString(),
            action.getType().name(),
            action.getDescription(),
            action.getRootCauseAddressed().name(),
            patternCsv
    );
    log.info("改善行动记录: type={}, description={}", action.getType(), action.getDescription());
}
/**
 * Evaluates an improvement by comparing the satisfaction rate in the
 * {@code daysAfter} days before the action with the {@code daysAfter} days after it.
 *
 * @param actionId  id of the action in {@code improvement_log}
 * @param daysAfter length in days of both comparison windows
 * @return the before/after rates, their difference and a verdict
 *         ({@code EFFECTIVE} when the rate rose by more than 0.05,
 *         {@code SLIGHTLY_EFFECTIVE} when it rose at all, otherwise {@code INEFFECTIVE}),
 *         or {@code ImprovementEffect.notFound()} when no such action exists
 */
public ImprovementEffect evaluateEffect(String actionId, int daysAfter) {
    // queryForObject throws when the id matches no row, so the not-found branch could
    // never be reached; queryForList returns an empty list instead.
    List<LocalDateTime> actionTimes = jdbc.queryForList(
            "SELECT created_at FROM improvement_log WHERE action_id = ?",
            LocalDateTime.class, actionId
    );
    if (actionTimes.isEmpty() || actionTimes.get(0) == null) {
        return ImprovementEffect.notFound();
    }
    LocalDateTime actionTime = actionTimes.get(0);
    LocalDateTime beforeStart = actionTime.minusDays(daysAfter);
    // NOTE(review): afterEnd can lie in the future when this is called fewer than
    // daysAfter days after the action, which shortens the "after" window.
    LocalDateTime afterEnd = actionTime.plusDays(daysAfter);
    // Satisfaction rate before and after the action.
    double beforeSatisfaction = getSatisfactionRate(beforeStart, actionTime);
    double afterSatisfaction = getSatisfactionRate(actionTime, afterEnd);
    double improvement = afterSatisfaction - beforeSatisfaction;
    // SLF4J only understands "{}" placeholders, so the numbers are formatted up front.
    log.info("改善效果: actionId={}, before={}, after={}, delta={}",
            actionId,
            String.format("%.3f", beforeSatisfaction),
            String.format("%.3f", afterSatisfaction),
            String.format("%+.3f", improvement));
    return new ImprovementEffect(beforeSatisfaction, afterSatisfaction, improvement,
            improvement > 0.05 ? "EFFECTIVE" :
            improvement > 0 ? "SLIGHTLY_EFFECTIVE" : "INEFFECTIVE");
}
/**
 * Computes the share of positive feedback created in the half-open window
 * {@code [from, to)}.
 *
 * <p>The upper bound is exclusive so that two adjacent windows sharing a boundary
 * never count the same row twice.
 *
 * @param from start of the window (inclusive)
 * @param to   end of the window (exclusive)
 * @return the positive share between 0 and 1; 0.0 when the window holds no feedback
 */
private double getSatisfactionRate(LocalDateTime from, LocalDateTime to) {
    Double rate = jdbc.queryForObject("""
            SELECT AVG(CASE WHEN is_positive THEN 1.0 ELSE 0.0 END)
            FROM user_feedback
            WHERE created_at >= ? AND created_at < ?
            """, Double.class, from, to);
    // AVG over zero rows is NULL.
    return rate != null ? rate : 0.0;
}
/**
 * Describes one improvement made to the system, as recorded in {@code improvement_log}.
 * Accessors and the builder are generated by Lombok.
 */
@Data
@Builder
public static class ImprovementAction {
private ImprovementType type;
private String description;
private FeedbackRootCauseAnalyzer.RootCause rootCauseAddressed;
// stored as a comma-separated string
private List<String> affectedQueryPatterns;
/** Kind of improvement; the constant name is stored in {@code action_type}. */
public enum ImprovementType {
KNOWLEDGE_ADDED, // new knowledge was added
KNOWLEDGE_UPDATED, // existing knowledge was updated
RETRIEVAL_TUNED, // retrieval parameters were adjusted
PROMPT_UPDATED, // the prompt was updated
SYNONYM_ADDED // a synonym mapping was added
}
}
record ImprovementEffect(double beforeSatisfaction, double afterSatisfaction,
double delta, String verdict) {
static ImprovementEffect notFound() {
return new ImprovementEffect(0, 0, 0, "NOT_FOUND");
}
}
}
实践建议
负面反馈是宝藏,不是负担
很多团队把用户差评当成"客诉需要处理"的紧急事件,而不是"系统改善的信号源"。我见过一个团队,6个月内收到1200条负面反馈,工程师的处理方式是安抚用户,但从来没有分析过这些反馈背后的模式。实际上,如果系统性地分析这些反馈,可以发现大约30-40%的负面反馈集中在十几类问题上,解决这些高频问题能显著改善整体满意度。
自动分类的准确率不需要100%
用LLM分析负面反馈的根因,准确率可能只有70-80%。这没关系。你需要的不是对每一条反馈都判断准确,而是找到高频出现的问题模式。10条反馈里有7条被正确分类为"检索问题",已经足够你知道该优化检索了。定期抽样人工复核自动分类结果,保持方向的正确性。
改善-验证的闭环必须量化
每次做了一个改善(比如补充了100篇FAQ文档),都要明确记录:
- 改善了什么(目标是解决什么问题)
- 期望的效果(比如涉及这个主题的满意率从60%提升到75%)
- 实际测量的时间与结果(例如改善上线7天后的满意率)
没有量化的改善追踪,团队很快就会不知道"有没有在进步",飞轮就停了。
