第1670篇:RAG系统的缓存架构——语义缓存的设计与命中率优化
有个数字我刚开始做RAG的时候没想到:在真实生产环境里,相似甚至相同的问题被重复问的比例,高得出奇。
我们分析过一个内部知识库的查询日志,2000条问题去重之后只剩1100条不同的问题。换句话说,将近一半的查询是重复的。
更有意思的是,即使不完全重复,很多查询语义上是一样的:
- "怎么申请年假" 和 "年假怎么请" 和 "请年假的流程是什么"
这三个问题的答案完全一样,但如果每次都走完整的RAG流程(检索→LLM生成),就会重复调用API、重复消耗Token、重复等待。
缓存是最直接的优化手段。但RAG的缓存不能用普通的字符串精确匹配——"怎么申请年假"和"年假怎么请"的字符串完全不同,传统缓存命不中。
这就是语义缓存要解决的核心问题:对语义相似的查询命中同一个缓存条目。
一、语义缓存的基本原理
语义缓存的核心思路很简单:
- 把历史查询的嵌入向量存到向量数据库里
- 新查询进来,先计算其嵌入向量
- 在缓存向量库里找最近邻,如果相似度超过阈值,直接返回对应的缓存结果
- 如果没有命中,正常走RAG流程,结果写入缓存
二、语义缓存核心实现
2.1 基础语义缓存服务
@Service
public class SemanticCacheService {

    // Dedicated vector store for cached queries — kept separate from the knowledge-base
    // store so cache lookups and knowledge retrieval never interfere (see pitfall #1).
    @Autowired
    private VectorStore cacheVectorStore;

    @Autowired
    private RedisTemplate<String, SemanticCacheEntry> redisTemplate;

    // Grouped with the other collaborators instead of dangling at the bottom of the class.
    @Autowired
    private MetricsService metrics;

    // Minimum similarity for a cache hit. Needs per-deployment tuning (see the threshold
    // optimizer): too low serves wrong answers, too high kills the hit rate.
    private static final double SIMILARITY_THRESHOLD = 0.92;

    // TTL policy: stable answers live a day; time-sensitive ones only 30 minutes.
    private static final Duration DEFAULT_TTL = Duration.ofHours(24);
    private static final Duration VOLATILE_TTL = Duration.ofMinutes(30);

    /**
     * Looks up a cached answer for a semantically similar past query.
     *
     * @param query the incoming user question
     * @return the cached result, or {@link Optional#empty()} on a miss or any cache error
     *         (the cache fails open so the caller continues with the normal RAG flow)
     */
    public Optional<CacheHitResult> get(String query) {
        long startTime = System.currentTimeMillis();
        try {
            // similaritySearch embeds the query internally, so we must not pre-embed it
            // here — the previous version called embeddingModel.embed() and discarded the
            // result, doubling the embedding cost of every lookup.
            List<Document> similar = cacheVectorStore.similaritySearch(
                    SearchRequest.query(query)
                            .withTopK(1)
                            .withSimilarityThreshold(SIMILARITY_THRESHOLD)
            );
            if (similar.isEmpty()) {
                metrics.recordCacheMiss(System.currentTimeMillis() - startTime);
                return Optional.empty();
            }
            Document closestQuery = similar.get(0);
            // Assumes getScore() is non-null for results above the threshold —
            // TODO confirm the vector-store contract.
            double similarity = closestQuery.getScore();
            // Fetch the full cached payload from Redis via the key stored in metadata.
            String cacheKey = (String) closestQuery.getMetadata().get("cache_key");
            SemanticCacheEntry entry = redisTemplate.opsForValue().get(cacheKey);
            if (entry == null) {
                // Vector record outlived its Redis entry (TTL expired) — clean it up lazily.
                cacheVectorStore.delete(List.of(closestQuery.getId()));
                metrics.recordCacheMiss(System.currentTimeMillis() - startTime);
                return Optional.empty();
            }
            metrics.recordCacheHit(similarity, System.currentTimeMillis() - startTime);
            // SLF4J placeholders are plain "{}"; the original "{:.3f}" is unsupported and
            // would be logged literally — format the number explicitly instead.
            log.debug("语义缓存命中:query='{}', cachedQuery='{}', similarity={}",
                    query, entry.getOriginalQuery(), String.format("%.3f", similarity));
            return Optional.of(CacheHitResult.builder()
                    .answer(entry.getAnswer())
                    .sourceDocs(entry.getSourceDocs())
                    .similarity(similarity)
                    .originalCachedQuery(entry.getOriginalQuery())
                    .cacheTime(entry.getCacheTime())
                    .build());
        } catch (Exception e) {
            log.error("语义缓存查询失败", e);
            metrics.recordCacheError();
            return Optional.empty(); // fail open: treat cache errors as a miss
        }
    }

    /**
     * Stores a fresh RAG result: the full payload goes to Redis (with TTL), and the query
     * text goes to the cache vector store for future similarity matching.
     *
     * @param query      the original user question
     * @param answer     the generated answer
     * @param sourceDocs documents the answer was grounded on (summarized before storage)
     * @param policy     caller-supplied hint for TTL selection
     */
    public void put(String query, String answer, List<Document> sourceDocs,
                    CachePolicy policy) {
        try {
            String cacheKey = "semantic:" + DigestUtils.md5Hex(query);
            Duration ttl = determineTTL(query, policy);

            // 1. Full result payload in Redis, expiring after the chosen TTL.
            SemanticCacheEntry entry = SemanticCacheEntry.builder()
                    .originalQuery(query)
                    .answer(answer)
                    .sourceDocs(sourceDocs.stream()
                            .map(d -> SourceDocSummary.of(d.getId(), d.getMetadata()))
                            .collect(Collectors.toList()))
                    .cacheTime(LocalDateTime.now())
                    .ttlSeconds(ttl.toSeconds())
                    .build();
            redisTemplate.opsForValue().set(cacheKey, entry, ttl);

            // 2. Query vector in the cache store; content is the raw query text so the
            //    store can embed it. The vector record has no TTL of its own — orphans
            //    are removed lazily in get() and by the nightly cleanup job.
            Document cacheDoc = Document.builder()
                    .id(cacheKey + "_vec")
                    .content(query)
                    .metadata(Map.of(
                            "cache_key", cacheKey,
                            "query_hash", DigestUtils.md5Hex(query),
                            "cache_time", LocalDateTime.now().toString(),
                            "ttl_seconds", ttl.toSeconds()
                    ))
                    .build();
            cacheVectorStore.add(List.of(cacheDoc));
            log.debug("语义缓存写入:query='{}', cacheKey={}, ttl={}s",
                    query, cacheKey, ttl.toSeconds());
        } catch (Exception e) {
            // Cache writes are best-effort; a failure must never break the main flow.
            log.error("语义缓存写入失败", e);
        }
    }

    /**
     * Chooses a TTL: an explicit policy wins; otherwise queries containing
     * time-sensitive keywords get the short volatile TTL.
     */
    private Duration determineTTL(String query, CachePolicy policy) {
        if (policy == CachePolicy.VOLATILE) return VOLATILE_TTL;
        if (policy == CachePolicy.PERSISTENT) return Duration.ofDays(7);
        List<String> timeKeywords = Arrays.asList(
                "今天", "最新", "当前", "现在", "最近", "本周", "本月",
                "最新版", "当前版本", "实时"
        );
        for (String kw : timeKeywords) {
            if (query.contains(kw)) return VOLATILE_TTL;
        }
        return DEFAULT_TTL;
    }
}
2.2 带缓存的RAG服务封装
@Service
public class CachedRAGService {

    @Autowired
    private SemanticCacheService semanticCache;

    @Autowired
    private RAGPipeline ragPipeline;

    @Autowired
    private QueryClassifier queryClassifier;

    /**
     * RAG query with a semantic-cache fast path: non-cacheable questions bypass the
     * cache entirely; cache hits return immediately with cache metadata attached;
     * misses run the full pipeline and store successful answers for future hits.
     */
    public RAGResponse query(String question, QueryContext context) {
        // Step 1: some questions must never be served from a shared cache.
        Cacheability cacheability = assessCacheability(question, context);
        if (cacheability == Cacheability.NOT_CACHEABLE) {
            return ragPipeline.execute(question, context);
        }

        // Step 2: try the semantic cache.
        Optional<CacheHitResult> cached = semanticCache.get(question);
        if (cached.isPresent()) {
            CacheHitResult hit = cached.get();
            // Annotate the response so callers can tell it came from cache.
            return RAGResponse.builder()
                    .answer(hit.getAnswer())
                    .fromCache(true)
                    .cacheSimilarity(hit.getSimilarity())
                    .cachedQuery(hit.getOriginalCachedQuery())
                    .cacheTime(hit.getCacheTime())
                    .build();
        }

        // Step 3: miss — run the normal RAG pipeline.
        RAGResponse response = ragPipeline.execute(question, context);

        // Step 4: only successful answers are worth caching.
        if (response.isSuccessful()) {
            semanticCache.put(question, response.getAnswer(), response.getSourceDocs(),
                    determineCachePolicy(question, cacheability));
        }
        return response;
    }

    /**
     * Classifies how safe it is to cache the answer to {@code question}.
     */
    private Cacheability assessCacheability(String question, QueryContext context) {
        // Personalized questions (first person + user context) must not be shared.
        if (question.contains("我的") && context.hasUserContext()) {
            return Cacheability.NOT_CACHEABLE;
        }
        // Real-time data (prices, stock levels) is stale the moment it is cached.
        for (String keyword : List.of("实时", "当前库存", "今日价格")) {
            if (question.contains(keyword)) {
                return Cacheability.NOT_CACHEABLE;
            }
        }
        // Time-sensitive phrasing gets a short-lived cache entry.
        for (String keyword : List.of("最新", "最近", "今天", "本周")) {
            if (question.contains(keyword)) {
                return Cacheability.VOLATILE;
            }
        }
        return Cacheability.STABLE;
    }

    /** Maps the cacheability class onto a TTL policy for the cache layer. */
    private CachePolicy determineCachePolicy(String question, Cacheability cacheability) {
        return cacheability == Cacheability.VOLATILE
                ? CachePolicy.VOLATILE
                : CachePolicy.DEFAULT;
    }
}
三、相似度阈值的调优方法
相似度阈值是语义缓存最关键的超参数。设太高,命中率低,缓存作用不大;设太低,把语义不同的问题误判为相同,返回错误答案。
@Service
public class CacheThresholdOptimizer {

    @Autowired
    private QueryLogRepository queryLogRepo;

    @Autowired
    private EmbeddingModel embeddingModel;

    @Autowired
    private LLMClient llmClient;

    /**
     * Finds the optimal cache-hit similarity threshold from historical query logs.
     *
     * Pipeline: sample query pairs -> embed both queries and compute cosine similarity
     * -> let an LLM judge whether the answers are interchangeable -> sweep thresholds
     * and pick the one maximizing F0.5 (precision-weighted, because serving a wrong
     * cached answer is worse than missing the cache).
     *
     * @return the analysis result with the chosen threshold and the full PR curve
     */
    public ThresholdAnalysisResult analyzeOptimalThreshold() {
        // 1. Sample historical query pairs.
        List<QueryPair> queryPairs = sampleQueryPairs(500);

        // 2. Annotate each pair with embedding similarity and LLM-judged equivalence.
        List<AnnotatedPair> annotatedPairs = new ArrayList<>();
        for (QueryPair pair : queryPairs) {
            float[] emb1 = toFloatArray(embeddingModel.embed(pair.getQuery1()).getOutput());
            float[] emb2 = toFloatArray(embeddingModel.embed(pair.getQuery2()).getOutput());
            double similarity = cosineSimilarity(emb1, emb2);
            boolean answerEquivalent = judgeAnswerEquivalence(
                    pair.getQuery1(), pair.getAnswer1(),
                    pair.getQuery2(), pair.getAnswer2()
            );
            annotatedPairs.add(AnnotatedPair.builder()
                    .query1(pair.getQuery1())
                    .query2(pair.getQuery2())
                    .similarity(similarity)
                    .answerEquivalent(answerEquivalent)
                    .build());
        }

        // 3. Sweep thresholds 0.70..1.00 in 0.01 steps. Stepping an integer avoids the
        //    floating-point drift of "t += 0.01", which accumulates error and can skip
        //    the 1.00 endpoint (and yields inexact map keys for the PR curve).
        List<Double> thresholds = new ArrayList<>();
        for (int i = 70; i <= 100; i++) {
            thresholds.add(i / 100.0);
        }
        Map<Double, PrecisionRecall> prCurve = new HashMap<>();
        for (double threshold : thresholds) {
            final double t = threshold;
            long truePositive = annotatedPairs.stream()
                    .filter(p -> p.getSimilarity() >= t && p.isAnswerEquivalent()).count();
            long falsePositive = annotatedPairs.stream()
                    .filter(p -> p.getSimilarity() >= t && !p.isAnswerEquivalent()).count();
            long falseNegative = annotatedPairs.stream()
                    .filter(p -> p.getSimilarity() < t && p.isAnswerEquivalent()).count();
            // No predictions above the threshold => perfect precision by convention.
            double precision = (truePositive + falsePositive) == 0 ? 1.0 :
                    (double) truePositive / (truePositive + falsePositive);
            double recall = (truePositive + falseNegative) == 0 ? 0.0 :
                    (double) truePositive / (truePositive + falseNegative);
            prCurve.put(threshold, PrecisionRecall.of(precision, recall));
        }

        // 4. Pick the threshold maximizing F-beta with beta=0.5: false cache hits
        //    (wrong answer served) are costlier than misses, so precision dominates.
        double optimalThreshold = thresholds.stream()
                .max(Comparator.comparingDouble(t -> {
                    PrecisionRecall pr = prCurve.get(t);
                    double beta = 0.5;
                    // Epsilon guards the 0/0 case when both precision and recall are 0.
                    return (1 + beta * beta) * pr.getPrecision() * pr.getRecall() /
                            (beta * beta * pr.getPrecision() + pr.getRecall() + 1e-10);
                }))
                .orElse(0.92);

        return ThresholdAnalysisResult.builder()
                .optimalThreshold(optimalThreshold)
                .prCurve(prCurve)
                .sampleSize(annotatedPairs.size())
                .build();
    }

    /**
     * Asks the LLM whether two Q&A pairs are semantically interchangeable.
     * Answers are truncated to 200 chars to bound prompt size.
     */
    private boolean judgeAnswerEquivalence(String q1, String a1, String q2, String a2) {
        String prompt = String.format("""
                判断以下两个问答对是否在语义上等价(即用户的信息需求相同,答案可以互用)。
                问答1:
                问:%s
                答:%s(节选)
                问答2:
                问:%s
                答:%s(节选)
                只回答"yes"或"no"。
                """, q1, a1.substring(0, Math.min(200, a1.length())),
                q2, a2.substring(0, Math.min(200, a2.length())));
        return llmClient.chat(prompt).trim().toLowerCase().startsWith("yes");
    }

    /**
     * Samples {@code count} random distinct-index query pairs from recent logs.
     * Fixed seed keeps runs reproducible. The previous version counted skipped
     * self-pairs (idx1 == idx2) toward the total and could return fewer pairs than
     * requested; this version retries until the quota is met, and returns an empty
     * list when the log has fewer than two entries.
     */
    private List<QueryPair> sampleQueryPairs(int count) {
        List<QueryLog> logs = queryLogRepo.sampleRecent(count * 3);
        List<QueryPair> pairs = new ArrayList<>(count);
        if (logs.size() < 2) {
            return pairs; // not enough data to form even one pair
        }
        Random random = new Random(42);
        while (pairs.size() < count) {
            int idx1 = random.nextInt(logs.size());
            int idx2 = random.nextInt(logs.size());
            if (idx1 == idx2) {
                continue;
            }
            pairs.add(QueryPair.of(
                    logs.get(idx1).getQuery(), logs.get(idx1).getAnswer(),
                    logs.get(idx2).getQuery(), logs.get(idx2).getAnswer()
            ));
        }
        return pairs;
    }

    /** Cosine similarity of two equal-length vectors; 0.0 when either is all zeros. */
    private double cosineSimilarity(float[] a, float[] b) {
        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        double denom = Math.sqrt(normA) * Math.sqrt(normB);
        // Guard against NaN for zero vectors (the original divided unconditionally).
        return denom == 0 ? 0.0 : dot / denom;
    }

    /** Unboxes a Double list into a float[] for the similarity math. */
    private float[] toFloatArray(List<Double> list) {
        float[] arr = new float[list.size()];
        for (int i = 0; i < list.size(); i++) arr[i] = list.get(i).floatValue();
        return arr;
    }
}
四、缓存失效策略
缓存的核心挑战之一是失效:知识库更新后,缓存的答案可能过时。
@Service
public class CacheInvalidationService {

    @Autowired
    private VectorStore cacheVectorStore;

    @Autowired
    private RedisTemplate<String, Object> redisTemplate;

    @Autowired
    private EmbeddingModel embeddingModel;

    // Grouped with the other collaborators instead of trailing the class body.
    @Autowired
    private SemanticCacheService semanticCacheService;

    @Autowired
    private CachedRAGService cachedRAGService;

    /**
     * Invalidates every cache entry whose answer was grounded on the given document.
     * Called when a knowledge-base document changes, so stale answers are evicted.
     *
     * NOTE(review): KEYS is O(keyspace) and blocks Redis; on a large keyspace this
     * should be replaced with an incremental SCAN. Left as-is here because that changes
     * the Redis access pattern, not this method's contract.
     *
     * @param docId id of the updated knowledge-base document
     * @return number of cache entries invalidated
     */
    public int invalidateCacheByDocument(String docId) {
        Set<String> allCacheKeys = redisTemplate.keys("semantic:*");
        if (allCacheKeys == null || allCacheKeys.isEmpty()) return 0;
        int invalidatedCount = 0;
        for (String cacheKey : allCacheKeys) {
            SemanticCacheEntry entry = (SemanticCacheEntry)
                    redisTemplate.opsForValue().get(cacheKey);
            if (entry != null) {
                // Assumes getSourceDocs() is non-null for stored entries — TODO confirm.
                boolean usedDoc = entry.getSourceDocs().stream()
                        .anyMatch(sd -> docId.equals(sd.getDocId()));
                if (usedDoc) {
                    redisTemplate.delete(cacheKey);
                    // Keep the vector store in sync; the vector id is "<cacheKey>_vec".
                    cacheVectorStore.delete(List.of(cacheKey + "_vec"));
                    invalidatedCount++;
                }
            }
        }
        log.info("文档 {} 更新,失效了 {} 条缓存", docId, invalidatedCount);
        return invalidatedCount;
    }

    /**
     * Pre-warms the cache for known high-frequency queries. Queries already cached are
     * skipped; each miss runs the cached RAG flow, which writes the cache as a side
     * effect. Throttled with a short sleep between requests.
     *
     * @param frequentQueries queries to warm (callers should pre-filter time-sensitive
     *                        questions — see pitfall #3)
     */
    public void warmUpCache(List<String> frequentQueries) {
        log.info("开始缓存预热,共{}个高频查询", frequentQueries.size());
        for (String query : frequentQueries) {
            Optional<CacheHitResult> existing = semanticCacheService.get(query);
            if (existing.isPresent()) {
                log.debug("查询已缓存,跳过:{}", query);
                continue;
            }
            try {
                cachedRAGService.query(query, QueryContext.empty());
                log.debug("预热完成:{}", query);
                Thread.sleep(100); // throttle so warm-up does not saturate the pipeline
            } catch (InterruptedException ie) {
                // The previous catch (Exception) swallowed the interrupt; restore the
                // flag and stop warming so the thread can shut down promptly.
                Thread.currentThread().interrupt();
                log.warn("缓存预热被中断,已停止");
                break;
            } catch (Exception e) {
                log.warn("预热失败:{}", query, e);
            }
        }
        log.info("缓存预热完成");
    }

    /**
     * Nightly sweep: removes cache vectors whose Redis payload has expired (Redis TTL
     * handles the payload; the vector store has no TTL, so orphans accumulate).
     * Runs daily at 03:00.
     *
     * NOTE(review): fetches up to 10k entries with an empty query in one shot —
     * simplified on purpose; a large cache would need pagination here.
     */
    @Scheduled(cron = "0 0 3 * * ?")
    public void cleanupExpiredVectorEntries() {
        log.info("开始清理过期缓存向量");
        List<Document> allCacheEntries = cacheVectorStore.similaritySearch(
                SearchRequest.query("").withTopK(10000)
        );
        List<String> toDelete = allCacheEntries.stream()
                .filter(doc -> {
                    String cacheKey = (String) doc.getMetadata().get("cache_key");
                    // hasKey may return a null Boolean on some RedisTemplate versions;
                    // the original "!hasKey(...)" could NPE while unboxing.
                    return cacheKey != null
                            && !Boolean.TRUE.equals(redisTemplate.hasKey(cacheKey));
                })
                .map(Document::getId)
                .collect(Collectors.toList());
        if (!toDelete.isEmpty()) {
            cacheVectorStore.delete(toDelete);
            log.info("清理过期缓存向量:{}条", toDelete.size());
        }
    }
}
五、缓存监控与分析
@Component
public class CacheMetricsCollector {

    // Constructor injection only — the previous version also carried an @Autowired
    // MeterRegistry field that was never read (the constructor parameter did all the
    // work), leaving a redundant second injection of the same bean.
    private final Counter hitCounter;
    private final Counter missCounter;
    private final Counter errorCounter;
    private final Timer hitLatencyTimer;
    private final Timer missLatencyTimer;
    private final DistributionSummary similarityDistribution;

    public CacheMetricsCollector(MeterRegistry meterRegistry) {
        // One counter name with a "result" tag so hit/miss/error share a metric family.
        this.hitCounter = Counter.builder("semantic_cache_requests")
                .tag("result", "hit")
                .register(meterRegistry);
        this.missCounter = Counter.builder("semantic_cache_requests")
                .tag("result", "miss")
                .register(meterRegistry);
        this.errorCounter = Counter.builder("semantic_cache_requests")
                .tag("result", "error")
                .register(meterRegistry);
        this.hitLatencyTimer = Timer.builder("semantic_cache_latency")
                .tag("type", "hit")
                .register(meterRegistry);
        this.missLatencyTimer = Timer.builder("semantic_cache_latency")
                .tag("type", "miss")
                .register(meterRegistry);
        this.similarityDistribution = DistributionSummary.builder("cache_hit_similarity")
                .register(meterRegistry);
    }

    /** Records a cache hit: count, lookup latency, and the matched similarity score. */
    public void recordCacheHit(double similarity, long latencyMs) {
        hitCounter.increment();
        hitLatencyTimer.record(latencyMs, TimeUnit.MILLISECONDS);
        similarityDistribution.record(similarity);
    }

    /** Records a cache miss and the time spent discovering it. */
    public void recordCacheMiss(long latencyMs) {
        missCounter.increment();
        missLatencyTimer.record(latencyMs, TimeUnit.MILLISECONDS);
    }

    /** Records a cache-layer failure (lookup threw and the caller fell through). */
    public void recordCacheError() {
        errorCounter.increment();
    }

    /**
     * Snapshot of cache statistics.
     *
     * NOTE: these counters are cumulative since process start — not the sliding
     * one-hour window the original comment claimed. A true windowed view would need a
     * step registry or time-bucketed counters.
     */
    public CacheStats getRecentStats() {
        double totalHits = hitCounter.count();
        double totalMisses = missCounter.count();
        double total = totalHits + totalMisses;
        return CacheStats.builder()
                .hitRate(total == 0 ? 0 : totalHits / total)
                .totalRequests((long) total)
                .avgHitLatencyMs(hitLatencyTimer.mean(TimeUnit.MILLISECONDS))
                .avgMissLatencyMs(missLatencyTimer.mean(TimeUnit.MILLISECONDS))
                .avgHitSimilarity(similarityDistribution.mean())
                .build();
    }
}
六、多层缓存架构
在高并发场景下,单层语义缓存可能不够,可以构建多层缓存:
@Service
public class MultiLayerCacheRAGService {

    @Autowired
    private RedisTemplate<String, String> exactCacheRedis; // L1

    @Autowired
    private SemanticCacheService semanticCache; // L2

    @Autowired
    private RAGPipeline localRAG; // L3

    @Autowired
    private EnhancedRAGPipeline enhancedRAG; // L4

    @Autowired
    private MetricsService metrics;

    /**
     * Four-layer query path, cheapest first:
     * L1 exact-match Redis cache, L2 semantic cache, L3 local RAG pipeline,
     * L4 enhanced (heavyweight) RAG pipeline. Confident L3/L4 answers are
     * written back to L1 and L2 for future requests.
     */
    public RAGResponse queryMultiLayer(String question) {
        String exactKey = "exact:" + DigestUtils.md5Hex(question);

        // L1: byte-identical query seen before.
        String l1Answer = exactCacheRedis.opsForValue().get(exactKey);
        if (l1Answer != null) {
            metrics.recordLayerHit("L1");
            return RAGResponse.fromCache(l1Answer, "L1");
        }

        // L2: semantically similar query seen before.
        Optional<CacheHitResult> l2Hit = semanticCache.get(question);
        if (l2Hit.isPresent()) {
            String answer = l2Hit.get().getAnswer();
            metrics.recordLayerHit("L2");
            // Promote to L1 so the next identical query skips the embedding step.
            exactCacheRedis.opsForValue().set(exactKey, answer, Duration.ofMinutes(30));
            return RAGResponse.fromCache(answer, "L2");
        }

        // L3: lightweight local pipeline; accept only confident answers.
        RAGResponse localAnswer = localRAG.execute(question);
        if (localAnswer.getConfidence() >= 0.7) {
            metrics.recordLayerHit("L3");
            writeBackToCache(question, localAnswer, exactKey);
            return localAnswer;
        }

        // L4: heavyweight enhanced pipeline — last resort, always returned.
        RAGResponse enhancedAnswer = enhancedRAG.execute(question);
        metrics.recordLayerHit("L4");
        if (enhancedAnswer.getConfidence() >= 0.7) {
            writeBackToCache(question, enhancedAnswer, exactKey);
        }
        return enhancedAnswer;
    }

    /** Writes a fresh, confident answer into both cache layers (L1 then L2). */
    private void writeBackToCache(String question, RAGResponse response, String exactKey) {
        exactCacheRedis.opsForValue().set(exactKey, response.getAnswer(),
                Duration.ofHours(1));
        semanticCache.put(question, response.getAnswer(),
                response.getSourceDocs(), CachePolicy.DEFAULT);
    }
}
七、实际效果数据
在我们的企业知识库项目(月活1000人左右,日均查询1万次)部署语义缓存之后:
| 指标 | 部署前 | 部署后 |
|---|---|---|
| P50延迟 | 850ms | 95ms(缓存命中后) |
| P95延迟 | 2200ms | 480ms(命中+未命中平均) |
| LLM API调用量 | 10,000次/天 | 4,200次/天(-58%) |
| 月度API成本 | 基准 | -55% |
| 语义缓存命中率 | - | 47% |
命中率47%这个数字意味着将近一半的请求不需要调用LLM,直接从缓存返回。延迟从850ms降到95ms,这对用户体验是质的提升。
关于命中率的现实预期:
- 内部知识库(用户群体固定,查询相对集中):40%-60%命中率是合理的
- 公开通用问答(用户查询多样):10%-25%
- FAQ类应用(问题集合固定):可以到80%+
八、踩过的坑
坑1:向量库选型要和主知识库分开
我们最开始把缓存向量也存在同一个知识库里,导致缓存查询和知识查询互相干扰,缓存命中错误的情况时有发生。
现在标准做法是:缓存向量和知识向量用不同的Collection(或者不同的数据库实例),完全隔离。
坑2:阈值太低的后果很严重
阈值设0.85的时候,有几次出现了"把不同部门的请假流程答案返回给错误部门的用户"这种问题——"研发部门怎么请假"和"销售部门怎么请假"的语义相似度可能达到0.87,但答案不一样。
在有部门区分的场景里,应该把部门信息也纳入缓存键,或者在过滤条件里加上部门限制。
坑3:缓存预热不能无脑预热
对所有"历史高频"问题预热,但有一部分高频问题是时效性强的("最新系统版本是什么"),预热了之后反而会长期返回过时信息。预热前必须过滤时效性强的问题。
这是RAG系列的第10篇。从幻觉分析、质量评估、图谱集成、自适应路由、多跳推理、纠错机制、文档处理、向量库选型、混合检索,到今天的缓存架构,我们把一个生产级RAG系统的核心模块基本都覆盖了。
RAG工程化是一个系统工程,每个环节都值得深入打磨。真正让系统好用的不是单一的技术,而是把所有环节都做到位。
