第2108篇:LLM应用的缓存策略——语义缓存与精确缓存的工程实践
2026/4/30大约 12 分钟
第2108篇:LLM应用的缓存策略——语义缓存与精确缓存的工程实践
适读人群:优化LLM应用成本和延迟的工程师 | 阅读时长:约12分钟 | 核心价值:掌握语义缓存的核心原理和落地方案,显著降低LLM调用成本和响应延迟
LLM调用的两个核心痛点:慢和贵。
一次GPT-4的调用动辄500ms到2秒,在高并发场景下直接成为瓶颈。成本更不用说,一个中等规模的RAG应用,每月的API费用很容易超过服务器成本。
缓存是最直接的优化手段,但LLM的缓存和普通HTTP接口的缓存有本质区别:用户输入永远不会完全重复。"今天天气怎么样"和"今天的天气如何"是两个不同的字符串,但语义几乎相同,应该返回同一个答案。
这就是语义缓存(Semantic Cache)的核心思想。
缓存类型概览
/**
* LLM应用中的缓存层次
*
* Level 1:精确缓存(Exact Cache)
* - 完全相同的请求(相同Prompt + 相同参数)返回缓存结果
* - 适合场景:文档摘要、固定模板生成、批量处理
* - 命中率:低(用户交互场景几乎没用)
*
* Level 2:语义缓存(Semantic Cache)
* - 语义相似的请求共享同一个缓存结果
* - 适合场景:FAQ问答、搜索、通用问答助手
* - 命中率:中高(相似问题很多)
*
* Level 3:会话级缓存(Session-level Cache)
* - 同一会话内,相同上下文的问题命中缓存
* - 适合场景:多轮对话中的重复查询
*
* Level 4:KV缓存(KV Cache / Prefix Cache)
* - 复用相同System Prompt前缀的计算
* - 适合场景:长System Prompt的批量处理
* - 注意:这是LLM服务层的优化,不是应用层
*
* 本文重点讲Level 1-3,Level 4由LLM服务商提供
*/
精确缓存实现
/**
 * Exact-match cache for LLM responses.
 *
 * Returns a cached response only for byte-identical requests (same prompt hash,
 * same parameters). Simple to implement, but nearly useless for interactive
 * traffic where user input rarely repeats exactly; best suited to batch jobs,
 * fixed templates, and repeated inference in CI/CD.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class ExactCacheService {
    private final RedisTemplate<String, String> redisTemplate;
    private final ObjectMapper objectMapper;
    private static final String CACHE_PREFIX = "llm:exact:";
    private static final Duration DEFAULT_TTL = Duration.ofHours(24);

    /**
     * Builds the cache key for a request.
     *
     * Included: everything that influences the output — model name, all message
     * contents in order, temperature, maxTokens.
     * Deliberately excluded: userId (the cache should be shared across users).
     *
     * @throws IllegalStateException if serialization or hashing fails
     *         (should not happen with SHA-256 and a working ObjectMapper)
     */
    public String generateCacheKey(LlmRequest request) {
        // LinkedHashMap keeps insertion order so the serialized JSON — and
        // therefore the hash — is stable across calls.
        Map<String, Object> keyComponents = new LinkedHashMap<>();
        keyComponents.put("model", request.getModel());
        keyComponents.put("temperature", request.getTemperature());
        keyComponents.put("maxTokens", request.getMaxTokens());
        // Messages are order-sensitive: both role and content matter.
        List<String> messageHashes = request.getMessages().stream()
                .map(m -> m.getRole() + ":" + m.getContent())
                .toList();
        keyComponents.put("messages", messageHashes);
        try {
            String json = objectMapper.writeValueAsString(keyComponents);
            byte[] hash = java.security.MessageDigest
                    .getInstance("SHA-256")
                    .digest(json.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            return CACHE_PREFIX + bytesToHex(hash);
        } catch (Exception e) {
            // IllegalStateException is a RuntimeException subclass, so existing
            // callers catching RuntimeException keep working; the cause is kept.
            throw new IllegalStateException("缓存key生成失败", e);
        }
    }

    /**
     * Reads a cached response.
     *
     * @return the cached answer, or empty on miss or on a Redis failure
     *         (cache errors never propagate to the caller)
     */
    public Optional<String> get(String cacheKey) {
        try {
            String cached = redisTemplate.opsForValue().get(cacheKey);
            if (cached != null) {
                log.debug("精确缓存命中: key={}", cacheKey);
                return Optional.of(cached);
            }
        } catch (Exception e) {
            // Pass the exception itself so the stack trace is not lost.
            log.warn("缓存读取失败", e);
        }
        return Optional.empty();
    }

    /** Stores a response with an explicit TTL; failures are logged, not thrown. */
    public void put(String cacheKey, String response, Duration ttl) {
        try {
            redisTemplate.opsForValue().set(cacheKey, response, ttl);
        } catch (Exception e) {
            // A cache write failure must never break the main flow.
            log.warn("缓存写入失败", e);
        }
    }

    /** Stores a response with the default 24h TTL. */
    public void put(String cacheKey, String response) {
        put(cacheKey, response, DEFAULT_TTL);
    }

    /** Lowercase-hex encoding; nibble lookup avoids a String.format per byte. */
    private String bytesToHex(byte[] bytes) {
        final char[] digits = "0123456789abcdef".toCharArray();
        char[] out = new char[bytes.length * 2];
        for (int i = 0; i < bytes.length; i++) {
            int v = bytes[i] & 0xFF;
            out[i * 2] = digits[v >>> 4];
            out[i * 2 + 1] = digits[v & 0x0F];
        }
        return new String(out);
    }
}
语义缓存核心实现
/**
 * Semantic cache service.
 *
 * Core idea:
 * 1. Store historical queries and their answers in a vector database.
 * 2. When a new query arrives, search for similar historical queries.
 * 3. If the similarity exceeds a threshold, return the historical answer.
 *
 * Key parameter: the similarity threshold.
 * - Too high (0.99+): low hit rate, effectively exact matching.
 * - Too low (below 0.7): high false-hit rate, irrelevant answers get served.
 * - Recommended range: 0.92-0.96 (rule of thumb; tune per scenario).
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class SemanticCacheService {
    private final EmbeddingModel embeddingModel;
    private final EmbeddingStore<TextSegment> embeddingStore;
    private final RedisTemplate<String, String> redisTemplate;
    private final ObjectMapper objectMapper;

    // Similarity threshold above which two queries are considered equivalent.
    @Value("${llm.cache.semantic.threshold:0.93}")
    private double similarityThreshold;

    // TTL for semantic entries. Usually shorter than the exact cache's TTL,
    // because answers shared across similar queries go stale faster.
    @Value("${llm.cache.semantic.ttl-hours:4}")
    private int ttlHours;

    private static final String CACHE_META_PREFIX = "llm:semantic:meta:";

    /**
     * Looks up the semantic cache.
     *
     * @param query   current user question
     * @param context optional context that scopes the answer (e.g. user role);
     *                may be null, in which case no scope filtering is applied
     * @return the cached hit if similarity exceeds the threshold, otherwise empty
     */
    public Optional<SemanticCacheHit> lookup(String query, CacheContext context) {
        try {
            // 1. Embed the incoming query.
            float[] queryVector = embeddingModel.embed(query).content().vector();
            // 2. Search the vector store for similar historical queries,
            //    restricted to the same context (same knowledge-base scope,
            //    same user role, ...).
            EmbeddingSearchRequest searchRequest = EmbeddingSearchRequest.builder()
                    .queryEmbedding(Embedding.from(queryVector))
                    .maxResults(3) // top-3 candidates
                    .minScore(similarityThreshold) // only matches above the threshold
                    .filter(buildContextFilter(context)) // context scoping
                    .build();
            EmbeddingSearchResult<TextSegment> results = embeddingStore.search(searchRequest);
            if (results.matches().isEmpty()) {
                return Optional.empty();
            }
            // 3. Take the best-scoring match.
            EmbeddingMatch<TextSegment> bestMatch = results.matches().get(0);
            double score = bestMatch.score();
            String cacheEntryId = bestMatch.embedded().metadata().getString("cacheEntryId");
            // 4. Fetch the full answer from Redis (the vector DB only stores
            //    queries; answers live in Redis).
            String metaKey = CACHE_META_PREFIX + cacheEntryId;
            String cachedMeta = redisTemplate.opsForValue().get(metaKey);
            if (cachedMeta == null) {
                // The Redis answer expired but the vector entry remains.
                // Clean the vector DB asynchronously, or just skip and let
                // periodic maintenance reap it.
                log.debug("语义缓存向量命中但答案已过期: entryId={}", cacheEntryId);
                return Optional.empty();
            }
            CacheEntry entry = objectMapper.readValue(cachedMeta, CacheEntry.class);
            log.debug("语义缓存命中: query='{}', matchedQuery='{}', score={}",
                    query, entry.originalQuery(), score);
            return Optional.of(new SemanticCacheHit(
                    entry.answer(), entry.originalQuery(), score, cacheEntryId
            ));
        } catch (Exception e) {
            // A cache failure must never break the main flow; degrade to the LLM.
            log.warn("语义缓存查询失败", e);
            return Optional.empty();
        }
    }

    /**
     * Stores a query/answer pair in the semantic cache.
     *
     * @param query   original query
     * @param answer  the LLM's answer
     * @param context context; may be null (stored with an empty scope string)
     */
    public void store(String query, String answer, CacheContext context) {
        try {
            String cacheEntryId = UUID.randomUUID().toString();
            // Fix: tolerate a null context instead of throwing NPE here —
            // lookup() already treats a null context as "no scope filter".
            String contextFilter = context != null ? context.toFilterString() : "";
            // 1. Embed the query and store it in the vector DB.
            float[] queryVector = embeddingModel.embed(query).content().vector();
            Metadata metadata = new Metadata();
            metadata.put("cacheEntryId", cacheEntryId);
            metadata.put("context", contextFilter);
            metadata.put("timestamp", String.valueOf(System.currentTimeMillis()));
            TextSegment segment = TextSegment.from(query, metadata);
            embeddingStore.add(Embedding.from(queryVector), segment);
            // 2. Store the query + answer metadata in Redis with a TTL.
            CacheEntry entry = new CacheEntry(cacheEntryId, query, answer,
                    System.currentTimeMillis());
            redisTemplate.opsForValue().set(
                    CACHE_META_PREFIX + cacheEntryId,
                    objectMapper.writeValueAsString(entry),
                    Duration.ofHours(ttlHours)
            );
            log.debug("语义缓存存储: query='{}', entryId={}", query, cacheEntryId);
        } catch (Exception e) {
            log.warn("语义缓存存储失败", e);
        }
    }

    /**
     * Builds the context filter.
     *
     * Questions from different scenarios must not hit each other's cache —
     * e.g. customer-service answers must not be served for tech-doc QA.
     * Returns null (no filtering) when there is no usable scope.
     */
    private dev.langchain4j.store.embedding.filter.Filter buildContextFilter(CacheContext context) {
        if (context == null || context.getScope() == null) {
            return null;
        }
        // Only search cache entries from the same scope.
        return dev.langchain4j.store.embedding.filter.MetadataFilterBuilder
                .metadataKey("context")
                .isEqualTo(context.toFilterString());
    }

    @Data
    @Builder
    public static class CacheContext {
        private String scope;    // usage scenario (customer_service/tech_qa/product_search/...)
        private String language; // language (zh/en)

        public String toFilterString() {
            return scope + ":" + (language != null ? language : "zh");
        }
    }

    public record SemanticCacheHit(
            String answer, String originalQuery, double similarity, String cacheEntryId
    ) {}

    record CacheEntry(
            String entryId, String originalQuery, String answer, long createdAt
    ) {}
}
缓存策略配置和动态调整
/**
 * Adaptive cache-threshold manager.
 *
 * Problem: different domains need different similarity thresholds.
 * - Math/code questions: must be very precise, high threshold (0.97+).
 * - FAQ / common questions: can be looser (0.90-0.93).
 * - Creative writing: usually not cacheable at all (each answer should differ).
 *
 * Solution: per-scope threshold configuration, raised automatically when the
 * false-hit rate climbs.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class CacheStrategyManager {
    private final RedisTemplate<String, String> redisTemplate;

    // Per-scope cache configuration.
    private final Map<String, CacheStrategyConfig> strategyConfigs = new ConcurrentHashMap<>(Map.of(
            "customer_service_faq", CacheStrategyConfig.builder()
                    .similarityThreshold(0.92)
                    .ttlHours(8)
                    .enabled(true)
                    .build(),
            "technical_qa", CacheStrategyConfig.builder()
                    .similarityThreshold(0.96)
                    .ttlHours(24)
                    .enabled(true)
                    .build(),
            "code_generation", CacheStrategyConfig.builder()
                    .similarityThreshold(0.99) // code generation all but requires exact matches
                    .ttlHours(48)
                    .enabled(true)
                    .build(),
            "creative_writing", CacheStrategyConfig.builder()
                    .similarityThreshold(1.1) // > 1 means disabled (similarity can never exceed 1)
                    .ttlHours(0)
                    .enabled(false) // caching is disabled outright for creative writing
                    .build()
    ));

    /**
     * Records negative feedback about a cache hit.
     *
     * When a user explicitly reports "this answer is wrong", the cache produced
     * a false hit; accumulate these reports and raise the threshold automatically.
     */
    public void recordNegativeFeedback(String scope, String cacheEntryId, String reason) {
        String feedbackKey = "cache:negative_feedback:" + scope;
        // increment() returns a nullable Long (e.g. inside pipelines); guard the unboxing.
        Long count = redisTemplate.opsForValue().increment(feedbackKey);
        long feedbackCount = count != null ? count : 0L;
        // Fix: set the TTL only when the counter is first created. The original
        // refreshed the TTL on every increment, turning the intended fixed 24h
        // window into a sliding one that never expires under steady feedback.
        if (feedbackCount == 1) {
            redisTemplate.expire(feedbackKey, Duration.ofHours(24));
        }
        log.warn("缓存误命中反馈: scope={}, entryId={}, reason={}, count24h={}",
                scope, cacheEntryId, reason, feedbackCount);
        // More than 10 negative reports within 24h: raise the threshold.
        if (feedbackCount >= 10) {
            adjustThresholdUp(scope);
            // Fix: restart the window after adjusting; otherwise every further
            // report would bump the threshold again until it hit the 0.99 cap.
            redisTemplate.delete(feedbackKey);
        }
    }

    /** Raises a scope's threshold by 0.02, capped at 0.99. */
    private void adjustThresholdUp(String scope) {
        CacheStrategyConfig config = strategyConfigs.get(scope);
        if (config == null) return;
        double currentThreshold = config.getSimilarityThreshold();
        double newThreshold = Math.min(0.99, currentThreshold + 0.02);
        config.setSimilarityThreshold(newThreshold);
        log.info("自动调高缓存阈值: scope={}, {} -> {}", scope, currentThreshold, newThreshold);
    }

    /** Returns the configured strategy for a scope, or a conservative default. */
    public CacheStrategyConfig getStrategy(String scope) {
        return strategyConfigs.getOrDefault(scope,
                CacheStrategyConfig.builder()
                        .similarityThreshold(0.93)
                        .ttlHours(4)
                        .enabled(true)
                        .build());
    }

    @Data
    @Builder
    public static class CacheStrategyConfig {
        private double similarityThreshold; // semantic-similarity threshold for a hit
        private int ttlHours;               // cache-entry TTL in hours
        private boolean enabled;            // whether caching is on for this scope
    }
}
完整的缓存代理层
/**
 * LLM cache proxy.
 *
 * Unifies the exact cache, the semantic cache, and the raw LLM call behind a
 * single class; callers only ever interact with this entry point.
 */
@Service
@Slf4j
public class CachedLlmService {
    // Fix: removed @RequiredArgsConstructor. Combined with the explicit
    // @Autowired constructor below, Lombok would generate a SECOND constructor
    // over all final fields (including the three counters), leaving the class
    // with two conflicting constructors.
    private final ExactCacheService exactCache;
    private final SemanticCacheService semanticCache;
    private final CacheStrategyManager strategyManager;
    private final ChatLanguageModel llm;
    private final MeterRegistry meterRegistry;

    // Cache statistics counters.
    private final Counter exactHitCounter;
    private final Counter semanticHitCounter;
    private final Counter cacheMissCounter;

    @Autowired
    public CachedLlmService(ExactCacheService exactCache,
                            SemanticCacheService semanticCache,
                            CacheStrategyManager strategyManager,
                            ChatLanguageModel llm,
                            MeterRegistry meterRegistry) {
        this.exactCache = exactCache;
        this.semanticCache = semanticCache;
        this.strategyManager = strategyManager;
        this.llm = llm;
        this.meterRegistry = meterRegistry;
        this.exactHitCounter = Counter.builder("llm.cache.exact.hit").register(meterRegistry);
        this.semanticHitCounter = Counter.builder("llm.cache.semantic.hit").register(meterRegistry);
        this.cacheMissCounter = Counter.builder("llm.cache.miss").register(meterRegistry);
    }

    /**
     * Cache-aware LLM call.
     *
     * Lookup order: exact cache -> semantic cache -> real LLM call,
     * with asynchronous write-back into both cache layers on a miss.
     *
     * @param query        user question
     * @param scope        usage scenario (selects the cache strategy)
     * @param systemPrompt system prompt
     */
    public LlmResponse generate(String query, String scope, String systemPrompt) {
        CacheStrategyManager.CacheStrategyConfig strategy = strategyManager.getStrategy(scope);
        if (!strategy.isEnabled()) {
            // Caching disabled for this scope: go straight to the LLM.
            return callLlmDirectly(query, systemPrompt, false);
        }
        // Step 1: try the exact cache.
        LlmRequest exactRequest = buildRequest(query, systemPrompt);
        String exactKey = exactCache.generateCacheKey(exactRequest);
        Optional<String> exactResult = exactCache.get(exactKey);
        if (exactResult.isPresent()) {
            exactHitCounter.increment();
            log.debug("精确缓存命中: scope={}", scope);
            return LlmResponse.fromCache(exactResult.get(), CacheSource.EXACT);
        }
        // Step 2: try the semantic cache, scoped by scenario and language.
        SemanticCacheService.CacheContext cacheContext =
                SemanticCacheService.CacheContext.builder()
                        .scope(scope)
                        .language(detectLanguage(query))
                        .build();
        Optional<SemanticCacheService.SemanticCacheHit> semanticResult =
                semanticCache.lookup(query, cacheContext);
        if (semanticResult.isPresent()) {
            SemanticCacheService.SemanticCacheHit hit = semanticResult.get();
            semanticHitCounter.increment();
            log.debug("语义缓存命中: scope={}, similarity={}", scope, hit.similarity());
            return LlmResponse.fromCache(hit.answer(), CacheSource.SEMANTIC,
                    hit.originalQuery(), hit.similarity(),
                    hit.cacheEntryId(), scope);
        }
        // Step 3: call the LLM, then populate both cache layers.
        cacheMissCounter.increment();
        LlmResponse response = callLlmDirectly(query, systemPrompt, true);
        // Async write-back so caching never adds latency to the caller.
        // NOTE(review): runs on the ForkJoin common pool; consider a dedicated
        // executor if cache writes (Redis + embedding) prove slow or blocking.
        String answer = response.getAnswer();
        CompletableFuture.runAsync(() -> {
            exactCache.put(exactKey, answer,
                    Duration.ofHours(strategy.getTtlHours()));
            semanticCache.store(query, answer, cacheContext);
        }).exceptionally(e -> {
            log.warn("异步缓存存储失败: {}", e.getMessage());
            return null;
        });
        return response;
    }

    /**
     * User reports that a cached answer was wrong.
     */
    public void reportIncorrectCacheResult(String cacheEntryId, String scope, String feedback) {
        strategyManager.recordNegativeFeedback(scope, cacheEntryId, feedback);
        // Optionally invalidate the offending entry immediately:
        // semanticCache.invalidate(cacheEntryId);
    }

    /** Uncached LLM call; optionally records the latency metric. */
    private LlmResponse callLlmDirectly(String query, String systemPrompt, boolean trackLatency) {
        long start = System.currentTimeMillis();
        String answer;
        try {
            answer = llm.generate(List.of(
                    SystemMessage.from(systemPrompt),
                    UserMessage.from(query)
            )).content().text();
        } catch (Exception e) {
            throw new RuntimeException("LLM调用失败", e);
        }
        long latency = System.currentTimeMillis() - start;
        if (trackLatency) {
            meterRegistry.timer("llm.call.latency").record(latency, java.util.concurrent.TimeUnit.MILLISECONDS);
        }
        log.debug("LLM直接调用: latencyMs={}", latency);
        return LlmResponse.fromLlm(answer, latency);
    }

    /** Builds the canonical request used for exact-cache key generation. */
    private LlmRequest buildRequest(String query, String systemPrompt) {
        return LlmRequest.builder()
                .model("gpt-4o-mini")
                .temperature(0.7)
                .maxTokens(2000)
                .messages(List.of(
                        new ChatMessageRecord("system", systemPrompt),
                        new ChatMessageRecord("user", query)
                ))
                .build();
    }

    private String detectLanguage(String text) {
        // Heuristic: treat the text as Chinese when more than 30% of its chars
        // fall in the CJK Unified Ideographs block (U+4E00..U+9FFF).
        long chineseCount = text.chars()
                .filter(c -> c >= 0x4E00 && c <= 0x9FFF)
                .count();
        return chineseCount > text.length() * 0.3 ? "zh" : "en";
    }

    @Data
    @Builder
    public static class LlmResponse {
        private String answer;
        private boolean fromCache;
        private CacheSource cacheSource;
        private String originalCachedQuery; // matched historical query (semantic hits only)
        private Double similarity;          // match score (semantic hits only)
        private String cacheEntryId;
        private String scope;
        private Long llmLatencyMs;          // set only for real LLM calls

        public static LlmResponse fromLlm(String answer, long latencyMs) {
            return LlmResponse.builder()
                    .answer(answer).fromCache(false)
                    .llmLatencyMs(latencyMs).build();
        }

        public static LlmResponse fromCache(String answer, CacheSource source) {
            return LlmResponse.builder()
                    .answer(answer).fromCache(true).cacheSource(source).build();
        }

        public static LlmResponse fromCache(String answer, CacheSource source,
                                            String originalQuery, double similarity,
                                            String entryId, String scope) {
            return LlmResponse.builder()
                    .answer(answer).fromCache(true).cacheSource(source)
                    .originalCachedQuery(originalQuery).similarity(similarity)
                    .cacheEntryId(entryId).scope(scope).build();
        }
    }

    public enum CacheSource { EXACT, SEMANTIC }
}
缓存预热策略
/**
 * Cache warm-up service.
 *
 * Pre-loads answers for high-frequency queries at startup, reducing the
 * volume of cold-start LLM calls.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class CacheWarmupService {
    private final CachedLlmService cachedLlmService;
    private final FaqRepository faqRepo;

    /**
     * Warms the FAQ cache once the application is ready.
     */
    @EventListener(ApplicationReadyEvent.class)
    public void warmupOnStartup() {
        log.info("开始缓存预热...");
        // Top 100 FAQ queries by historical frequency.
        List<FrequentQuery> topQueries = faqRepo.findTopQueries(100);
        if (topQueries.isEmpty()) {
            log.info("没有历史查询数据,跳过预热");
            return;
        }
        int warmed = 0;
        int skipped = 0;
        for (FrequentQuery fq : topQueries) {
            try {
                // Go through the cached entry point: a hit means the entry is
                // already warm (skip); a miss calls the LLM and the answer is
                // stored into both cache layers as a side effect.
                CachedLlmService.LlmResponse response =
                        cachedLlmService.generate(fq.query(), fq.scope(), fq.systemPrompt());
                if (response.isFromCache()) {
                    skipped++;
                } else {
                    warmed++;
                    // Throttle so warm-up traffic does not saturate the LLM.
                    Thread.sleep(200);
                }
            } catch (InterruptedException ie) {
                // Fix: the original swallowed InterruptedException inside the
                // broad catch below. Restore the interrupt flag and stop early.
                Thread.currentThread().interrupt();
                log.warn("缓存预热被中断, 提前结束: warmed={}, skipped={}", warmed, skipped);
                return;
            } catch (Exception e) {
                log.warn("预热失败: query='{}': {}", fq.query(), e.getMessage());
            }
        }
        log.info("缓存预热完成: warmed={}, skipped={}", warmed, skipped);
    }

    /**
     * Scheduled batch warm-up, refreshed daily at 02:00.
     */
    @Scheduled(cron = "0 0 2 * * *")
    public void scheduledWarmup() {
        // Yesterday's high-frequency queries are likely to recur today;
        // load them ahead of time.
        List<FrequentQuery> yesterdayQueries = faqRepo.findYesterdayTopQueries(50);
        yesterdayQueries.forEach(fq -> {
            try {
                cachedLlmService.generate(fq.query(), fq.scope(), fq.systemPrompt());
            } catch (Exception e) {
                log.warn("定时预热失败: {}", e.getMessage());
            }
        });
    }

    /** A historical query with its usage frequency. */
    record FrequentQuery(String query, String scope, String systemPrompt, int frequency) {}
}
缓存效果监控
/**
 * Cache-effectiveness dashboard service.
 *
 * Supplies operations-facing cache performance figures.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class CacheAnalyticsService {
    private final RedisTemplate<String, String> redisTemplate;
    private final MeterRegistry meterRegistry;

    /**
     * Aggregated cache statistics for the daily report.
     */
    public CacheStats getDailyStats() {
        // Live counter readings from Micrometer.
        final double hitsExact = getCounterValue("llm.cache.exact.hit");
        final double hitsSemantic = getCounterValue("llm.cache.semantic.hit");
        final double missed = getCounterValue("llm.cache.miss");
        final double hits = hitsExact + hitsSemantic;
        final double total = hits + missed;
        final double hitRate = total > 0 ? hits / total : 0;
        // Cost estimate: assume ~1000 tokens per avoided LLM call at roughly
        // $0.15 per 1M tokens (GPT-4o-mini pricing).
        final double savedCost = hits * 1000 * 0.15 / 1_000_000;
        // Latency estimate: ~800ms per LLM call vs ~5ms per cache hit.
        final double savedLatencySeconds = hits * 0.795;
        return new CacheStats(
                (long) hitsExact,
                (long) hitsSemantic,
                (long) missed,
                hitRate,
                savedCost,
                savedLatencySeconds
        );
    }

    /** Reads a counter's current value; missing counters count as zero. */
    private double getCounterValue(String counterName) {
        Counter counter = meterRegistry.find(counterName).counter();
        if (counter == null) {
            return 0;
        }
        return counter.count();
    }

    public record CacheStats(
            long exactHits,
            long semanticHits,
            long misses,
            double hitRate,
            double estimatedCostSaved, // USD
            double estimatedTimeSaved  // seconds
    ) {
        @Override
        public String toString() {
            return String.format(
                    "缓存统计: 精确命中=%d, 语义命中=%d, 未命中=%d, 命中率=%.1f%%, " +
                    "节省费用=$%.4f, 节省延迟=%.1fs",
                    exactHits, semanticHits, misses, hitRate * 100,
                    estimatedCostSaved, estimatedTimeSaved
            );
        }
    }
}
实践建议
先测量,再实施
在投入精力实现语义缓存之前,先分析一下你的查询日志,看看实际的重复率。我见过一些场景,用户的问题高度个性化(比如"帮我分析我今天上传的这份合同"),根本没有重复性,加缓存没有意义。但FAQ型问答、通用助手等场景,24小时内的语义重复率通常能达到30-50%,缓存价值非常高。
相似度阈值是关键参数,要花时间调
0.93是个不错的起点,但不同业务场景差异很大。建议这样调:先用低阈值(0.85)收集数据,人工评估命中的缓存对是否真的语义等价,找到"刚好开始出现误命中"的临界点,然后往上加0.02-0.03作为安全边距。这个过程值得花2-3天时间,能避免后续大量的客诉。
缓存答案的时效性问题
对于时间敏感的问题("最新的XX是什么"、"今天的..."),要么直接禁用缓存,要么TTL设置很短(1小时内)。最坏的情况是用户问"最新的利率是多少",你给他返回了3天前缓存的答案。在System Prompt里加上"如果问题涉及实时信息,不要从缓存回答"是个可行的补充措施。
