第2047篇:Redis在LLM应用中的五种用法
2026/4/30大约 7 分钟
第2047篇:Redis在LLM应用中的五种用法
适读人群:在Java项目中构建LLM应用的工程师 | 阅读时长:约19分钟 | 核心价值:掌握Redis在LLM应用中的核心使用模式,降低成本、提升性能
LLM应用和普通Web应用的一个很大区别:每次API调用都比较贵,而且慢(几百毫秒到几秒)。
Redis在这里的价值不只是缓存那么简单。我把LLM应用里Redis的用法整理了一下,主要是这五种场景。
用法一:语义缓存(最省钱的优化)
普通缓存是精确匹配,语义缓存是相似度匹配:
/**
 * Semantic cache implementation.
 * Core idea: store each question's embedding in Redis; on lookup, return the
 * answer of the most similar cached question instead of calling the LLM again.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class SemanticCacheService {

    private final RedisTemplate<String, String> redisTemplate;
    private final EmbeddingModel embeddingModel;
    private final ObjectMapper objectMapper;

    // Similarity threshold: above this value two questions count as the same.
    private static final double SIMILARITY_THRESHOLD = 0.92;
    // Time-to-live of each cache entry.
    private static final Duration CACHE_TTL = Duration.ofHours(24);
    // Maximum number of entries kept in Redis.
    private static final int MAX_CACHE_SIZE = 10000;

    private static final String CACHE_KEY_PREFIX = "semantic-cache:";
    private static final String CACHE_INDEX_KEY = "semantic-cache:index";

    /**
     * Semantic cache lookup.
     *
     * @param question the incoming question
     * @return the cached answer, or {@code null} on a cache miss
     */
    public String get(String question) {
        float[] questionEmbedding = embeddingModel.embed(question);
        // Full scan over all cached questions; a real deployment should use a
        // vector index (e.g. RediSearch) instead of iterating every key.
        Set<String> cachedKeys = redisTemplate.opsForSet().members(CACHE_INDEX_KEY);
        if (cachedKeys == null || cachedKeys.isEmpty()) {
            return null;
        }
        CacheEntry bestMatch = null;
        double bestSimilarity = 0;
        for (String key : cachedKeys) {
            String entryJson = redisTemplate.opsForValue().get(key);
            if (entryJson == null) {
                // The value expired but its key is still in the index set;
                // drop it so the index does not grow without bound.
                redisTemplate.opsForSet().remove(CACHE_INDEX_KEY, key);
                continue;
            }
            try {
                CacheEntry entry = objectMapper.readValue(entryJson, CacheEntry.class);
                double similarity = cosineSimilarity(questionEmbedding, entry.getEmbedding());
                if (similarity > SIMILARITY_THRESHOLD && similarity > bestSimilarity) {
                    bestSimilarity = similarity;
                    // Keep the already-deserialized entry: re-reading the key
                    // later could miss (TTL expiry between the two reads).
                    bestMatch = entry;
                }
            } catch (Exception e) {
                log.warn("读取缓存条目失败: {}", key, e);
            }
        }
        if (bestMatch != null) {
            // SLF4J only understands "{}" placeholders; the original "{:.3f}"
            // would be printed literally, so pre-format the similarity.
            log.info("语义缓存命中,相似度: {}, 问题: {}",
                    String.format("%.3f", bestSimilarity), question);
            return bestMatch.getAnswer();
        }
        return null;
    }

    /**
     * Stores a question/answer pair in the semantic cache.
     */
    public void put(String question, String answer) {
        float[] embedding = embeddingModel.embed(question);
        CacheEntry entry = new CacheEntry(question, embedding, answer,
                System.currentTimeMillis());
        String key = CACHE_KEY_PREFIX + UUID.randomUUID();
        try {
            String json = objectMapper.writeValueAsString(entry);
            redisTemplate.opsForValue().set(key, json, CACHE_TTL);
            redisTemplate.opsForSet().add(CACHE_INDEX_KEY, key);
            // Bound the cache size.
            Long size = redisTemplate.opsForSet().size(CACHE_INDEX_KEY);
            if (size != null && size > MAX_CACHE_SIZE) {
                // Evict a random entry (SPOP); production could use LRU instead.
                String oldKey = redisTemplate.opsForSet().pop(CACHE_INDEX_KEY);
                if (oldKey != null) {
                    redisTemplate.delete(oldKey);
                }
            }
        } catch (JsonProcessingException e) {
            log.error("缓存写入失败", e);
        }
    }

    /**
     * Cosine similarity of two vectors.
     * NOTE(review): assumes both vectors have the same length and a non-zero
     * norm (true for embeddings from one model) — confirm upstream.
     */
    private double cosineSimilarity(float[] a, float[] b) {
        double dot = 0, na = 0, nb = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            na += a[i] * a[i];
            nb += b[i] * b[i];
        }
        return dot / (Math.sqrt(na) * Math.sqrt(nb));
    }

    /** Cached record: original question, its embedding, the answer, write time. */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    private static class CacheEntry {
        private String question;
        private float[] embedding;
        private String answer;
        private long timestamp;
    }
}
/**
 * Wraps LLM calls with a semantic-cache layer.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class CachedLlmService {

    private final ChatLanguageModel llmModel;
    private final SemanticCacheService semanticCache;

    /**
     * Answers a question, consulting the semantic cache before calling the LLM.
     */
    public String ask(String question) {
        // Cache lookup first.
        String cached = semanticCache.get(question);
        if (cached != null) {
            return cached;
        }
        // Cache miss: call the LLM.
        String answer = llmModel.generate(question);
        // Write to the cache asynchronously so the response is not delayed.
        // runAsync swallows exceptions silently, so surface them in the log.
        CompletableFuture.runAsync(() -> semanticCache.put(question, answer))
                .exceptionally(e -> {
                    log.warn("语义缓存写入失败", e);
                    return null;
                });
        return answer;
    }
}
实际效果:对于重复率高的场景(如FAQ问答),语义缓存命中率可以达到40-60%,对应的API成本直接减少。
用法二:速率限制
LLM API有每分钟Token限制,内部用户也需要防止单个用户把配额耗尽:
/**
 * Redis-backed rate limiter for LLM calls.
 * Per-minute call limit via a sliding window (ZSET of call timestamps) plus a
 * per-day token budget via a simple counter.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmRateLimiter {

    private final StringRedisTemplate redisTemplate;

    // Maximum calls per user per minute.
    private static final int MAX_CALLS_PER_MINUTE = 20;
    // Maximum tokens per user per day.
    private static final int MAX_TOKENS_PER_DAY = 100_000;

    /**
     * Checks whether the user is within both limits.
     * NOTE(review): check (here) and record (recordCall) are not atomic, so
     * concurrent requests can slightly overshoot the limits; use a Lua script
     * if a hard guarantee is required.
     */
    public RateLimitResult checkRateLimit(String userId, int estimatedTokens) {
        long now = System.currentTimeMillis();
        long windowStart = now - 60_000; // one minute ago

        // 1. Per-minute call limit (1-minute sliding window).
        String callKey = "rate:calls:" + userId;
        redisTemplate.opsForZSet().removeRangeByScore(callKey, 0, windowStart);
        Long callCount = redisTemplate.opsForZSet().count(callKey, windowStart, now);
        if (callCount != null && callCount >= MAX_CALLS_PER_MINUTE) {
            log.warn("用户{}调用频率超限: {}次/分", userId, callCount);
            return RateLimitResult.rateLimited("调用频率超出限制,请稍后重试");
        }

        // 2. Per-day token budget.
        String tokenKey = "rate:tokens:" + userId + ":" + getCurrentDate();
        String tokenCountStr = redisTemplate.opsForValue().get(tokenKey);
        long tokenCount = tokenCountStr != null ? Long.parseLong(tokenCountStr) : 0;
        if (tokenCount + estimatedTokens > MAX_TOKENS_PER_DAY) {
            log.warn("用户{}今日Token超限: {}", userId, tokenCount);
            return RateLimitResult.rateLimited("今日使用量已达上限");
        }
        return RateLimitResult.allowed();
    }

    /**
     * Records one call (invoke after the LLM call succeeds).
     */
    public void recordCall(String userId, int actualTokensUsed) {
        long now = System.currentTimeMillis();
        // Record the call timestamp for the sliding window. ZSET members must
        // be unique: with the bare timestamp as member, two calls in the same
        // millisecond would collapse into a single entry and under-count, so a
        // random suffix keeps members distinct.
        String callKey = "rate:calls:" + userId;
        redisTemplate.opsForZSet().add(callKey, now + ":" + UUID.randomUUID(), now);
        redisTemplate.expire(callKey, Duration.ofMinutes(2));
        // Accumulate today's token usage.
        String tokenKey = "rate:tokens:" + userId + ":" + getCurrentDate();
        redisTemplate.opsForValue().increment(tokenKey, actualTokensUsed);
        redisTemplate.expire(tokenKey, Duration.ofDays(2));
    }

    /** Current date as yyyyMMdd; scopes the daily token counter. */
    private String getCurrentDate() {
        return LocalDate.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
    }

    /** Outcome of a rate-limit check. */
    public record RateLimitResult(boolean allowed, String message) {
        public static RateLimitResult allowed() {
            return new RateLimitResult(true, null);
        }
        public static RateLimitResult rateLimited(String message) {
            return new RateLimitResult(false, message);
        }
    }
}
用法三:分布式对话记忆
前面LangChain4j的文章里提到了Redis持久化ChatMemory。这里再说一个用法:跨服务的会话共享。
/**
 * Distributed conversation context.
 * Lets multiple service instances share one user's conversation state.
 */
@Service
@RequiredArgsConstructor
public class DistributedConversationContext {

    private final RedisTemplate<String, Object> redisTemplate;

    private static final String CTX_KEY_PREFIX = "conversation:ctx:";
    private static final Duration CTX_TTL = Duration.ofHours(2);

    /**
     * Saves the key context extracted from the current conversation.
     * Leaner than the full chat history, suitable for cross-service sharing.
     */
    public void updateContext(String sessionId, ConversationContext ctx) {
        String key = CTX_KEY_PREFIX + sessionId;
        Map<String, String> fields = ctx.toMap();
        // Redis HMSET rejects an empty field map (Spring Data Redis throws
        // IllegalArgumentException), so skip the write when there is no data.
        if (fields.isEmpty()) {
            return;
        }
        redisTemplate.opsForHash().putAll(key, fields);
        redisTemplate.expire(key, CTX_TTL);
    }

    /**
     * Loads a session's context; all fields are null when nothing is stored.
     */
    public ConversationContext getContext(String sessionId) {
        String key = CTX_KEY_PREFIX + sessionId;
        Map<Object, Object> map = redisTemplate.opsForHash().entries(key);
        return ConversationContext.fromMap(map);
    }

    /** Compact conversation state shared between service instances. */
    @Data
    @Builder
    public static class ConversationContext {
        private String userId;
        private String lastOrderId;          // most recently discussed order
        private String lastProductId;        // most recently discussed product
        private String userIntent;           // current intent
        private String pendingConfirmation;  // action awaiting confirmation

        /** Serializes the non-null fields into a Redis-hash-friendly map. */
        public Map<String, String> toMap() {
            Map<String, String> map = new HashMap<>();
            if (userId != null) map.put("userId", userId);
            if (lastOrderId != null) map.put("lastOrderId", lastOrderId);
            if (lastProductId != null) map.put("lastProductId", lastProductId);
            if (userIntent != null) map.put("userIntent", userIntent);
            if (pendingConfirmation != null) map.put("pendingConfirmation", pendingConfirmation);
            return map;
        }

        /** Rebuilds a context from a Redis hash; missing keys become null. */
        public static ConversationContext fromMap(Map<Object, Object> map) {
            return ConversationContext.builder()
                    .userId((String) map.get("userId"))
                    .lastOrderId((String) map.get("lastOrderId"))
                    .lastProductId((String) map.get("lastProductId"))
                    .userIntent((String) map.get("userIntent"))
                    .pendingConfirmation((String) map.get("pendingConfirmation"))
                    .build();
        }
    }
}
用法四:LLM请求队列
高峰期LLM请求超出并发限制时,用Redis队列做缓冲:
/**
 * LLM request queue.
 * Buffers requests in Redis during traffic peaks and bounds the number of
 * concurrent LLM calls so the provider's limits are not exceeded.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmRequestQueue {

    private final StringRedisTemplate redisTemplate;
    private final ChatLanguageModel llmModel;

    private static final String QUEUE_KEY = "llm:request:queue";
    private static final String RESULT_KEY_PREFIX = "llm:result:";
    // Maximum number of concurrent LLM calls.
    private static final int MAX_CONCURRENT = 5;

    private final Semaphore concurrencySemaphore = new Semaphore(MAX_CONCURRENT);

    /**
     * Enqueues a request and returns its id for later polling.
     */
    public String submitRequest(String userId, String prompt) {
        String requestId = UUID.randomUUID().toString();
        LlmRequest request = new LlmRequest(requestId, userId, prompt,
                System.currentTimeMillis());
        redisTemplate.opsForList().rightPush(QUEUE_KEY, toJson(request));
        log.info("LLM请求入队: requestId={}, 队列长度: {}",
                requestId, redisTemplate.opsForList().size(QUEUE_KEY));
        return requestId;
    }

    /**
     * Polls for the result of a previously submitted request.
     */
    public Optional<String> getResult(String requestId) {
        String resultKey = RESULT_KEY_PREFIX + requestId;
        return Optional.ofNullable(redisTemplate.opsForValue().get(resultKey));
    }

    /**
     * Queue consumer (runs in the background).
     */
    @Scheduled(fixedDelay = 100) // poll every 100 ms
    public void processQueue() {
        if (concurrencySemaphore.availablePermits() == 0) {
            return; // all permits in use, try again on the next tick
        }
        String requestJson = redisTemplate.opsForList().leftPop(QUEUE_KEY);
        if (requestJson == null) {
            return; // queue empty
        }
        LlmRequest request = fromJson(requestJson, LlmRequest.class);
        // Process asynchronously.
        CompletableFuture.runAsync(() -> {
            // Track whether the permit was actually obtained: the original
            // released in finally even when acquire() threw before acquiring,
            // which would mint an extra permit and break the concurrency cap.
            boolean acquired = false;
            try {
                concurrencySemaphore.acquire();
                acquired = true;
                String answer = llmModel.generate(request.getPrompt());
                // Store the result in Redis with a 10-minute TTL.
                String resultKey = RESULT_KEY_PREFIX + request.getRequestId();
                redisTemplate.opsForValue().set(resultKey, answer, Duration.ofMinutes(10));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } catch (Exception e) {
                // Previously this propagated into the CompletableFuture and
                // vanished, silently losing the request; log it instead.
                // NOTE(review): no result is written for a failed request, so
                // getResult() pollers must use a timeout — confirm callers do.
                log.error("LLM请求处理失败: requestId={}", request.getRequestId(), e);
            } finally {
                if (acquired) {
                    concurrencySemaphore.release();
                }
            }
        });
    }

    /** Queued request payload. */
    @Data
    @AllArgsConstructor
    private static class LlmRequest {
        private String requestId;
        private String userId;
        private String prompt;
        private long submitTime;
    }
}
用法五:LLM调用结果的精确缓存
对于确定性的LLM调用(temperature=0,每次结果相同),可以做精确缓存:
/**
 * Cache for deterministic LLM calls.
 * Intended for temperature=0 workloads such as data extraction and
 * classification, where identical inputs always produce identical outputs.
 */
@Aspect
@Component
@RequiredArgsConstructor
@Slf4j
public class DeterministicLlmCacheAspect {

    private final StringRedisTemplate redisTemplate;

    /**
     * Around-advice: return the cached result when present, otherwise run the
     * annotated method and cache its serialized return value.
     */
    @Around("@annotation(cacheLlm)")
    public Object cacheAroundLlmCall(ProceedingJoinPoint pjp,
                                     CacheLlmResult cacheLlm) throws Throwable {
        // Cache key = method name + hash of the argument list.
        final String key = buildCacheKey(pjp);

        // Cache probe.
        final String hit = redisTemplate.opsForValue().get(key);
        if (hit != null) {
            log.debug("LLM结果缓存命中: {}", key);
            return deserialize(hit, pjp.getSignature());
        }

        // Miss: invoke the real method, then store its serialized result with
        // the TTL taken from the annotation.
        final Object outcome = pjp.proceed();
        redisTemplate.opsForValue().set(
                key,
                serialize(outcome),
                Duration.of(cacheLlm.ttlHours(), ChronoUnit.HOURS));
        return outcome;
    }

    /** Derives a stable key from the method signature and argument contents. */
    private String buildCacheKey(ProceedingJoinPoint pjp) {
        byte[] argBytes = Arrays.deepToString(pjp.getArgs())
                .getBytes(StandardCharsets.UTF_8);
        return "llm-cache:" + pjp.getSignature().toShortString()
                + ":" + DigestUtils.md5DigestAsHex(argBytes);
    }
}
/**
 * Marks a method whose return value may be cached by
 * {@link DeterministicLlmCacheAspect}; intended for deterministic
 * (temperature=0) LLM calls where the same input yields the same output.
 */
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface CacheLlmResult {
// Cache time-to-live in hours; defaults to one day.
int ttlHours() default 24;
}
// Usage example
@Service
@RequiredArgsConstructor
public class TextAnalysisService {

    // Bug fix: the original declared this final field with no constructor and
    // no Lombok annotation, so it could never be initialized (compile error);
    // @RequiredArgsConstructor generates the injection constructor, matching
    // the style of the other services in this article.
    private final ChatLanguageModel llm;

    /**
     * Classifies a text via a deterministic LLM call; results are cached for
     * 48 hours by DeterministicLlmCacheAspect.
     *
     * NOTE(review): TextCategory.valueOf throws IllegalArgumentException when
     * the model returns a label outside the enum — confirm the prompt
     * constrains the output, or add a fallback category.
     */
    @CacheLlmResult(ttlHours = 48)
    public TextCategory classify(String text) {
        String result = llm.generate("对以下文本进行分类:" + text);
        // Locale.ROOT keeps the enum-name casing locale-independent
        // (e.g. avoids the Turkish dotless-i problem).
        return TextCategory.valueOf(result.trim().toUpperCase(Locale.ROOT));
    }
}
这五种用法,从成本优化到性能提升,覆盖了LLM应用中Redis的主要使用场景。优先级排序:语义缓存 > 速率限制 > 对话记忆 > 精确缓存 > 请求队列。根据你的场景选择性实现。
