Semantic Cache实战：AI接口语义缓存完整实现方案

老张2026/4/30大约 9 分钟

Semantic Cache实战：AI接口语义缓存完整实现方案

适读人群：有1-5年Java开发经验，想向AI工程师方向转型的开发者 阅读时长：约18分钟 文章价值：
深度理解语义缓存的原理与普通缓存的本质区别
掌握从零构建生产级语义缓存的完整实现
学会调优相似度阈值、缓存命中策略等关键参数

同一个意思，问了一百遍

这个问题坑了我好久。

我们的AI客服系统上线后，有段时间命中率统计让我很困惑：精确缓存的命中率只有12%，但按我的判断，用户问的问题其实重复度很高。

打开日志一看，全是这样的情况：

用户A：发票怎么开？
用户B：请问发票如何开具？
用户C：我想开发票，要怎么操作
用户D：开发票的流程是什么
用户E：帮我申请发票

五个用户，同一个意思，但字面上完全不同，精确缓存全部未命中，每个都乖乖调了一次LLM。

这就是语义缓存要解决的核心问题：语义相同但表达不同的问题，应该复用同一个答案。

今天我来写一个比 article-022 更深入的版本，专门讲语义缓存的细节——如何构建向量索引、如何调阈值、如何处理时效性、如何做缓存预热。

语义缓存 vs 精确缓存

完整架构设计

核心实现

语义缓存引擎

package com.laozhang.ai.semantic;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.embedding.EmbeddingModel;
import org.springframework.ai.vectorstore.SearchRequest;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.ai.document.Document;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.stereotype.Service;

import java.time.Duration;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;

/**
 * 语义缓存引擎
 * 
 * 架构说明：
 * - 向量索引：存储问题的向量表示（用于相似度检索）
 * - 答案存储：Redis Hash，存储 questionId -> answer 映射
 * - 两者分离设计：向量索引负责"找到相似"，Redis负责"快速读取答案"
 */
@Service
@Slf4j
@RequiredArgsConstructor
public class SemanticCacheEngine {

    private final VectorStore vectorStore;
    private final EmbeddingModel embeddingModel;
    private final RedisTemplate<String, String> redisTemplate;

    @Value("${semantic-cache.similarity-threshold:0.92}")
    private double similarityThreshold;

    @Value("${semantic-cache.ttl-hours:24}")
    private int ttlHours;

    @Value("${semantic-cache.max-entries:100000}")
    private int maxEntries;

    private static final String ANSWER_KEY_PREFIX = "sematic_cache:answer:";
    private static final String STATS_KEY = "semantic_cache:stats";

    // 异步写入执行器（避免缓存写入阻塞主流程）
    private final Executor asyncExecutor = Executors.newVirtualThreadPerTaskExecutor();

    /**
     * 查询语义缓存
     * 
     * @param question 用户问题
     * @return Optional包含缓存命中的答案，empty则未命中
     */
    public Optional<CacheHit> get(String question) {
        long startTime = System.currentTimeMillis();
        
        try {
            // 向量化查询词
            SearchRequest searchRequest = SearchRequest.query(question)
                .withTopK(3)  // 取Top3，选最好的
                .withSimilarityThreshold(similarityThreshold);
            
            List<Document> results = vectorStore.similaritySearch(searchRequest);
            
            if (results.isEmpty()) {
                recordStats("miss");
                return Optional.empty();
            }
            
            // 取相似度最高的结果
            Document best = results.get(0);
            String questionId = (String) best.getMetadata().get("question_id");
            
            if (questionId == null) {
                return Optional.empty();
            }
            
            // 从Redis读取答案（快速）
            String answer = redisTemplate.opsForValue().get(
                ANSWER_KEY_PREFIX + questionId
            );
            
            if (answer == null) {
                // 向量存在但答案已过期（TTL不一致导致），清理并返回未命中
                log.warn("语义缓存向量存在但答案已过期: questionId={}", questionId);
                recordStats("miss");
                return Optional.empty();
            }
            
            // 计算实际相似度分数（从metadata获取）
            double similarity = extractSimilarity(best);
            long latency = System.currentTimeMillis() - startTime;
            
            recordStats("hit");
            log.debug("[语义缓存命中] 相似度={}, 延迟={}ms, 问题={}",
                String.format("%.3f", similarity), latency,
                question.substring(0, Math.min(30, question.length())));
            
            return Optional.of(new CacheHit(answer, similarity, questionId));
            
        } catch (Exception e) {
            log.warn("语义缓存查询异常: {}", e.getMessage());
            return Optional.empty();
        }
    }

    /**
     * 将新的问答对存入语义缓存
     * 异步执行，不阻塞主流程
     */
    public void putAsync(String question, String answer) {
        CompletableFuture.runAsync(() -> put(question, answer), asyncExecutor);
    }

    /**
     * 同步存入缓存
     */
    public void put(String question, String answer) {
        try {
            String questionId = UUID.randomUUID().toString();
            
            // 1. 将问题存入向量索引（包含questionId作为元数据）
            Document doc = new Document(
                question,  // 用问题文本生成向量
                Map.of(
                    "question_id", questionId,
                    "question_preview", question.substring(0, Math.min(100, question.length())),
                    "created_at", String.valueOf(System.currentTimeMillis())
                )
            );
            vectorStore.add(List.of(doc));
            
            // 2. 将答案存入Redis（questionId -> answer）
            redisTemplate.opsForValue().set(
                ANSWER_KEY_PREFIX + questionId,
                answer,
                Duration.ofHours(ttlHours)
            );
            
            log.debug("[语义缓存写入] questionId={}, 问题长度={}, 答案长度={}",
                questionId, question.length(), answer.length());
            
        } catch (Exception e) {
            log.error("语义缓存写入失败: {}", e.getMessage());
        }
    }

    /**
     * 缓存预热：将高频问题预先加载到缓存
     * 建议在系统启动后执行
     */
    public void warmUp(List<QAPair> frequentQAs) {
        log.info("开始语义缓存预热，共{}条数据", frequentQAs.size());
        
        frequentQAs.parallelStream().forEach(qa -> {
            // 先检查是否已经缓存过
            Optional<CacheHit> existing = get(qa.question());
            if (existing.isEmpty()) {
                put(qa.question(), qa.answer());
            }
        });
        
        log.info("语义缓存预热完成");
    }

    private double extractSimilarity(Document doc) {
        // Spring AI VectorStore返回的相似度在metadata里
        Object score = doc.getMetadata().get("score");
        if (score instanceof Double d) return d;
        if (score instanceof Float f) return f.doubleValue();
        return similarityThreshold; // 默认返回阈值
    }

    private void recordStats(String type) {
        try {
            redisTemplate.opsForHash().increment(STATS_KEY, "total", 1);
            redisTemplate.opsForHash().increment(STATS_KEY, type, 1);
        } catch (Exception ignored) {}
    }

    /**
     * 获取缓存统计信息
     */
    public CacheStats getStats() {
        try {
            Map<Object, Object> stats = redisTemplate.opsForHash().entries(STATS_KEY);
            long total = getLong(stats, "total");
            long hits = getLong(stats, "hit");
            long misses = getLong(stats, "miss");
            double hitRate = total > 0 ? (double) hits / total * 100 : 0;
            
            return new CacheStats(total, hits, misses, hitRate);
        } catch (Exception e) {
            return new CacheStats(0, 0, 0, 0);
        }
    }

    private long getLong(Map<Object, Object> map, String key) {
        Object val = map.get(key);
        if (val == null) return 0;
        return Long.parseLong(val.toString());
    }

    public record CacheHit(String answer, double similarity, String questionId) {}
    public record QAPair(String question, String answer) {}
    public record CacheStats(long total, long hits, long misses, double hitRate) {}
}

阈值自适应调整器

这个是高级特性，能根据历史数据自动调整阈值：

/**
 * Adaptive similarity-threshold tuner for the semantic cache.
 *
 * <p>Problem: a fixed threshold (0.92) is not necessarily optimal.
 * <ul>
 *   <li>Too high: low hit rate, little cost saved.</li>
 *   <li>Too low: answers to different questions get mixed up.</li>
 * </ul>
 *
 * <p>Approach: user feedback is counted per similarity bucket (two decimals) in a Redis
 * hash, and the threshold stored under {@code THRESHOLD_KEY} is re-evaluated periodically.
 */
@Service
@Slf4j
@RequiredArgsConstructor
public class AdaptiveThresholdService {

    private final RedisTemplate<String, String> redisTemplate;

    // Bounds and step of the candidate thresholds that are evaluated.
    private static final double MIN_THRESHOLD = 0.85;
    private static final double MAX_THRESHOLD = 0.98;
    private static final double ADJUSTMENT_STEP = 0.01;

    private static final double DEFAULT_THRESHOLD = 0.92;
    private static final double TARGET_ACCURACY = 0.95;
    private static final long MIN_SAMPLES_PER_BUCKET = 10;
    private static final long EVALUATION_INTERVAL = 100;

    private static final String FEEDBACK_KEY = "cache:feedback";
    private static final String THRESHOLD_KEY = "cache:current_threshold";
    // Running count of all feedback events, kept in the same hash as the buckets.
    private static final String TOTAL_FIELD = "total";

    /**
     * Records one piece of user feedback about a cache hit.
     *
     * @param questionId the id of the cache entry that was served (not used for bucketing)
     * @param similarity the similarity score of the hit
     * @param isAccurate whether the user judged the answer accurate (thumbs up/down)
     */
    public void recordFeedback(String questionId, double similarity, boolean isAccurate) {
        String field = (isAccurate ? "accurate:" : "inaccurate:") + bucketLabel(similarity);
        redisTemplate.opsForHash().increment(FEEDBACK_KEY, field, 1);

        // Count feedback events with a dedicated counter. The hash's size() is the number
        // of distinct fields (buckets), not the number of feedbacks, so it cannot drive
        // an "every N feedbacks" trigger.
        Long totalFeedback = redisTemplate.opsForHash().increment(FEEDBACK_KEY, TOTAL_FIELD, 1);
        if (totalFeedback != null && totalFeedback % EVALUATION_INTERVAL == 0) {
            evaluateAndAdjustThreshold();
        }
    }

    /**
     * Analyses the accumulated feedback and updates the stored threshold.
     *
     * <p>Picks the lowest similarity bucket between {@code MIN_THRESHOLD} and
     * {@code MAX_THRESHOLD} that has at least {@code MIN_SAMPLES_PER_BUCKET} samples and an
     * accuracy of at least {@code TARGET_ACCURACY}; falls back to {@code DEFAULT_THRESHOLD}
     * when no bucket qualifies. The stored value is only rewritten when it differs from the
     * current threshold by more than one step.
     */
    public void evaluateAndAdjustThreshold() {
        Map<Object, Object> feedback = redisTemplate.opsForHash().entries(FEEDBACK_KEY);

        // Iterate in integer hundredths: repeatedly adding 0.01 to a double drifts and can
        // overshoot MAX_THRESHOLD, silently skipping the last bucket.
        int first = toHundredths(MIN_THRESHOLD);
        int last = toHundredths(MAX_THRESHOLD);
        int step = Math.max(1, toHundredths(ADJUSTMENT_STEP));

        // Accuracy per bucket, keyed by threshold in hundredths (ascending order).
        Map<Integer, Double> accuracyByScore = new TreeMap<>();
        for (int hundredths = first; hundredths <= last; hundredths += step) {
            String bucket = bucketLabel(hundredths / 100.0);
            long accurate = getLong(feedback, "accurate:" + bucket);
            long inaccurate = getLong(feedback, "inaccurate:" + bucket);
            long total = accurate + inaccurate;

            if (total >= MIN_SAMPLES_PER_BUCKET) {
                accuracyByScore.put(hundredths, (double) accurate / total);
            }
        }

        // Lowest bucket whose accuracy reaches the target.
        double optimalThreshold = DEFAULT_THRESHOLD;
        for (Map.Entry<Integer, Double> entry : accuracyByScore.entrySet()) {
            if (entry.getValue() >= TARGET_ACCURACY) {
                optimalThreshold = entry.getKey() / 100.0;
                break;
            }
        }

        // Compare in whole hundredths so floating-point noise cannot flip the decision;
        // "more than one step apart" mirrors the original "> 0.01" condition.
        double currentThreshold = getCurrentThreshold();
        int gap = Math.abs(toHundredths(optimalThreshold) - toHundredths(currentThreshold));
        if (gap > step) {
            redisTemplate.opsForValue().set(
                THRESHOLD_KEY, String.valueOf(optimalThreshold)
            );
            log.info("[阈值调整] {} -> {}", currentThreshold, optimalThreshold);
        }
    }

    /**
     * Returns the threshold currently stored in Redis.
     *
     * @return the stored value, or {@code DEFAULT_THRESHOLD} if none is stored or the stored
     *         value is not a valid number
     */
    public double getCurrentThreshold() {
        String val = redisTemplate.opsForValue().get(THRESHOLD_KEY);
        if (val == null) {
            return DEFAULT_THRESHOLD;
        }
        try {
            return Double.parseDouble(val);
        } catch (NumberFormatException e) {
            log.warn("[阈值读取] 无法解析阈值 '{}'，使用默认值 {}", val, DEFAULT_THRESHOLD);
            return DEFAULT_THRESHOLD;
        }
    }

    /**
     * Formats a similarity as its two-decimal bucket label, e.g. {@code 0.9234 -> "0.92"}.
     * Uses {@link Locale#ROOT} so the decimal separator is always a dot regardless of the
     * JVM's default locale; writers and readers of the hash must agree on field names.
     */
    private static String bucketLabel(double similarity) {
        return String.format(Locale.ROOT, "%.2f", similarity);
    }

    /** Converts a threshold to whole hundredths, e.g. {@code 0.92 -> 92}. */
    private static int toHundredths(double value) {
        return (int) Math.round(value * 100);
    }

    /** Parses a hash field as a long, treating a missing field as 0. */
    private long getLong(Map<Object, Object> map, String key) {
        Object val = map.get(key);
        return val == null ? 0 : Long.parseLong(val.toString());
    }
}

与Spring AI的完整集成

/**
 * AI service facade with an integrated semantic cache.
 * Handles the cache lookup, the LLM call on a miss, and caching of the fresh answer.
 *
 * <p>NOTE(review): the cache is looked up and populated by {@code question} only;
 * {@code systemContext} is not part of the lookup, so an answer produced under one system
 * context can be served for another. Confirm this is acceptable for all callers.
 *
 * <p>NOTE(review): {@code thresholdService} only receives feedback here; nothing in this
 * class reads the adjusted threshold back when deciding whether a hit is acceptable.
 */
@Service
@Slf4j
@RequiredArgsConstructor
public class CachedAIFacade {

    private final ChatClient chatClient;
    private final SemanticCacheEngine semanticCache;
    private final AdaptiveThresholdService thresholdService;

    /**
     * Answers a question, serving from the semantic cache when possible.
     * The caching is transparent to the caller.
     *
     * @param question      the user's question
     * @param systemContext the system prompt used when the LLM has to be called
     * @return the answer together with its origin (cache or LLM), similarity and latency;
     *         {@code cacheQuestionId} is set only for cache hits
     */
    public AIAnswer ask(String question, String systemContext) {
        long start = System.currentTimeMillis();

        // 1. Try the semantic cache first.
        Optional<SemanticCacheEngine.CacheHit> hit = semanticCache.get(question);

        if (hit.isPresent()) {
            SemanticCacheEngine.CacheHit cacheHit = hit.get();
            return AIAnswer.builder()
                .answer(cacheHit.answer())
                .fromCache(true)
                .similarity(cacheHit.similarity())
                .latencyMs(System.currentTimeMillis() - start)
                .cacheQuestionId(cacheHit.questionId())
                .build();
        }

        // 2. Cache miss: call the LLM.
        String answer = chatClient.prompt()
            .system(systemContext)
            .user(question)
            .call()
            .content();

        long latency = System.currentTimeMillis() - start;

        // 3. Cache the answer asynchronously so the response is not delayed. An empty
        //    response is never cached: it would be replayed for every similar question
        //    until the entry expires.
        if (answer != null && !answer.isBlank()) {
            semanticCache.putAsync(question, answer);
        } else {
            log.warn("LLM返回空答案，跳过缓存写入");
        }

        return AIAnswer.builder()
            .answer(answer)
            .fromCache(false)
            .similarity(0.0)
            .latencyMs(latency)
            .build();
    }

    /**
     * Forwards user feedback on an answer to the adaptive threshold service.
     *
     * @param questionId the cache entry id the answer came from
     * @param similarity the similarity reported with the answer
     * @param accurate   whether the user judged the answer accurate
     */
    public void feedback(String questionId, double similarity, boolean accurate) {
        thresholdService.recordFeedback(questionId, similarity, accurate);
    }

    /**
     * Result of {@link #ask(String, String)}.
     *
     * @param answer          the answer text
     * @param fromCache       {@code true} if served from the semantic cache
     * @param similarity      similarity of the cache match; {@code 0.0} for fresh LLM answers
     * @param latencyMs       elapsed time of the call in milliseconds
     * @param cacheQuestionId id of the cache entry that was hit; unset for fresh LLM answers
     */
    @lombok.Builder
    public record AIAnswer(
        String answer,
        boolean fromCache,
        double similarity,
        long latencyMs,
        String cacheQuestionId
    ) {}
}

性能对比与调优建议

场景	无缓存	精确缓存	语义缓存（threshold=0.92）	语义缓存（自适应）
重复率高的FAQ	100% LLM	命中率30%	命中率75%	命中率80%
自由对话	100% LLM	命中率5%	命中率35%	命中率40%
技术问答	100% LLM	命中率20%	命中率60%	命中率65%
平均响应延迟	2000ms	50ms（命中）/2000ms（未中）	100ms（命中）/2000ms（未中）	同左

调阈值的经验：

0.95以上：几乎等于精确匹配，命中率低，但基本不会用错答案
0.92-0.95：推荐范围，平衡命中率和准确率
0.88-0.92：命中率高但有时会用错答案，需要监控
0.88以下：风险区，不同问题的答案可能被混用

生产注意事项

缓存失效问题：如果知识库更新了（比如政策变化），旧的缓存答案就过时了。建议给所有缓存条目设置合理的TTL（24-48小时），并在知识库更新时主动清空相关缓存（包括Redis中的答案和向量索引中对应的条目）。

冷启动问题：新系统上线时没有缓存数据，命中率为0。建议做预热——把历史问答记录（脱敏处理）批量导入，让系统第一天就有合理的命中率。

缓存污染：如果某次LLM给了错误答案，被缓存了，后续相似问题都会拿到错误答案。建议加入用户反馈机制，可以标记某个缓存条目"答案有误"，触发重新调用LLM生成。

向量索引容量：100万条缓存条目约需要6-8GB向量存储（1536维、float32：100万 × 1536 × 4字节 ≈ 6GB，再加上索引开销），要规划好存储资源。超过一定数量后需要考虑LRU淘汰或分层存储。