第2047篇:Redis在LLM应用中的五种用法
2026/4/30大约 7 分钟
第2047篇:Redis在LLM应用中的五种用法
适读人群:在Java项目中构建LLM应用的工程师 | 阅读时长:约19分钟 | 核心价值:掌握Redis在LLM应用中的核心使用模式,降低成本、提升性能
LLM应用和普通Web应用的一个很大区别:每次API调用都比较贵,而且慢(几百毫秒到几秒)。
Redis在这里的价值不只是缓存那么简单。我把LLM应用里Redis的用法整理了一下,主要是这五种场景。
用法一:语义缓存(最省钱的优化)
普通缓存是精确匹配,语义缓存是相似度匹配:
/**
 * Semantic cache implementation.
 * Core idea: store each question's embedding in Redis; on lookup, return the
 * answer of the most similar cached question instead of calling the LLM again.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class SemanticCacheService {

    private final RedisTemplate<String, String> redisTemplate;
    private final EmbeddingModel embeddingModel;
    private final ObjectMapper objectMapper;

    // Similarity threshold: above this value two questions count as the same.
    private static final double SIMILARITY_THRESHOLD = 0.92;
    // Time-to-live of each cache entry.
    private static final Duration CACHE_TTL = Duration.ofHours(24);
    // Maximum number of entries kept in Redis.
    private static final int MAX_CACHE_SIZE = 10000;

    private static final String CACHE_KEY_PREFIX = "semantic-cache:";
    private static final String CACHE_INDEX_KEY = "semantic-cache:index";

    /**
     * Semantic cache lookup.
     *
     * @param question the incoming question
     * @return the cached answer, or {@code null} on a cache miss
     */
    public String get(String question) {
        float[] questionEmbedding = embeddingModel.embed(question);
        // Full scan over all cached questions; a real deployment should use a
        // vector index (e.g. RediSearch) instead of iterating every key.
        Set<String> cachedKeys = redisTemplate.opsForSet().members(CACHE_INDEX_KEY);
        if (cachedKeys == null || cachedKeys.isEmpty()) {
            return null;
        }
        CacheEntry bestMatch = null;
        double bestSimilarity = 0;
        for (String key : cachedKeys) {
            String entryJson = redisTemplate.opsForValue().get(key);
            if (entryJson == null) {
                // The value expired but its key is still in the index set;
                // drop it so the index does not grow without bound.
                redisTemplate.opsForSet().remove(CACHE_INDEX_KEY, key);
                continue;
            }
            try {
                CacheEntry entry = objectMapper.readValue(entryJson, CacheEntry.class);
                double similarity = cosineSimilarity(questionEmbedding, entry.getEmbedding());
                if (similarity > SIMILARITY_THRESHOLD && similarity > bestSimilarity) {
                    bestSimilarity = similarity;
                    // Keep the already-deserialized entry: re-reading the key
                    // later could miss (TTL expiry between the two reads).
                    bestMatch = entry;
                }
            } catch (Exception e) {
                log.warn("读取缓存条目失败: {}", key, e);
            }
        }
        if (bestMatch != null) {
            // SLF4J only understands "{}" placeholders; the original "{:.3f}"
            // would be printed literally, so pre-format the similarity.
            log.info("语义缓存命中,相似度: {}, 问题: {}",
                    String.format("%.3f", bestSimilarity), question);
            return bestMatch.getAnswer();
        }
        return null;
    }

    /**
     * Stores a question/answer pair in the semantic cache.
     */
    public void put(String question, String answer) {
        float[] embedding = embeddingModel.embed(question);
        CacheEntry entry = new CacheEntry(question, embedding, answer,
                System.currentTimeMillis());
        String key = CACHE_KEY_PREFIX + UUID.randomUUID();
        try {
            String json = objectMapper.writeValueAsString(entry);
            redisTemplate.opsForValue().set(key, json, CACHE_TTL);
            redisTemplate.opsForSet().add(CACHE_INDEX_KEY, key);
            // Bound the cache size.
            Long size = redisTemplate.opsForSet().size(CACHE_INDEX_KEY);
            if (size != null && size > MAX_CACHE_SIZE) {
                // Evict a random entry (SPOP); production could use LRU instead.
                String oldKey = redisTemplate.opsForSet().pop(CACHE_INDEX_KEY);
                if (oldKey != null) {
                    redisTemplate.delete(oldKey);
                }
            }
        } catch (JsonProcessingException e) {
            log.error("缓存写入失败", e);
        }
    }

    /**
     * Cosine similarity of two vectors.
     * NOTE(review): assumes both vectors have the same length and a non-zero
     * norm (true for embeddings from one model) — confirm upstream.
     */
    private double cosineSimilarity(float[] a, float[] b) {
        double dot = 0, na = 0, nb = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            na += a[i] * a[i];
            nb += b[i] * b[i];
        }
        return dot / (Math.sqrt(na) * Math.sqrt(nb));
    }

    /** Cached record: original question, its embedding, the answer, write time. */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    private static class CacheEntry {
        private String question;
        private float[] embedding;
        private String answer;
        private long timestamp;
    }
}
/**
 * Wraps LLM calls with a semantic-cache layer.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class CachedLlmService {

    private final ChatLanguageModel llmModel;
    private final SemanticCacheService semanticCache;

    /**
     * Answers a question, consulting the semantic cache before calling the LLM.
     */
    public String ask(String question) {
        // Cache lookup first.
        String cached = semanticCache.get(question);
        if (cached != null) {
            return cached;
        }
        // Cache miss: call the LLM.
        String answer = llmModel.generate(question);
        // Write to the cache asynchronously so the response is not delayed.
        // runAsync swallows exceptions silently, so surface them in the log.
        CompletableFuture.runAsync(() -> semanticCache.put(question, answer))
                .exceptionally(e -> {
                    log.warn("语义缓存写入失败", e);
                    return null;
                });
        return answer;
    }
}
实际效果:对于重复率高的场景(如FAQ问答),语义缓存命中率可以达到40-60%,对应的API成本直接减少。
用法二:速率限制
LLM API有每分钟Token限制,内部用户也需要防止单个用户把配额耗尽:
/**
 * Redis-backed rate limiter for LLM calls.
 * Per-minute call limit via a sliding window (ZSET of call timestamps) plus a
 * per-day token budget via a simple counter.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmRateLimiter {

    private final StringRedisTemplate redisTemplate;

    // Maximum calls per user per minute.
    private static final int MAX_CALLS_PER_MINUTE = 20;
    // Maximum tokens per user per day.
    private static final int MAX_TOKENS_PER_DAY = 100_000;

    /**
     * Checks whether the user is within both limits.
     * NOTE(review): check (here) and record (recordCall) are not atomic, so
     * concurrent requests can slightly overshoot the limits; use a Lua script
     * if a hard guarantee is required.
     */
    public RateLimitResult checkRateLimit(String userId, int estimatedTokens) {
        long now = System.currentTimeMillis();
        long windowStart = now - 60_000; // one minute ago

        // 1. Per-minute call limit (1-minute sliding window).
        String callKey = "rate:calls:" + userId;
        redisTemplate.opsForZSet().removeRangeByScore(callKey, 0, windowStart);
        Long callCount = redisTemplate.opsForZSet().count(callKey, windowStart, now);
        if (callCount != null && callCount >= MAX_CALLS_PER_MINUTE) {
            log.warn("用户{}调用频率超限: {}次/分", userId, callCount);
            return RateLimitResult.rateLimited("调用频率超出限制,请稍后重试");
        }

        // 2. Per-day token budget.
        String tokenKey = "rate:tokens:" + userId + ":" + getCurrentDate();
        String tokenCountStr = redisTemplate.opsForValue().get(tokenKey);
        long tokenCount = tokenCountStr != null ? Long.parseLong(tokenCountStr) : 0;
        if (tokenCount + estimatedTokens > MAX_TOKENS_PER_DAY) {
            log.warn("用户{}今日Token超限: {}", userId, tokenCount);
            return RateLimitResult.rateLimited("今日使用量已达上限");
        }
        return RateLimitResult.allowed();
    }

    /**
     * Records one call (invoke after the LLM call succeeds).
     */
    public void recordCall(String userId, int actualTokensUsed) {
        long now = System.currentTimeMillis();
        // Record the call timestamp for the sliding window. ZSET members must
        // be unique: with the bare timestamp as member, two calls in the same
        // millisecond would collapse into a single entry and under-count, so a
        // random suffix keeps members distinct.
        String callKey = "rate:calls:" + userId;
        redisTemplate.opsForZSet().add(callKey, now + ":" + UUID.randomUUID(), now);
        redisTemplate.expire(callKey, Duration.ofMinutes(2));
        // Accumulate today's token usage.
        String tokenKey = "rate:tokens:" + userId + ":" + getCurrentDate();
        redisTemplate.opsForValue().increment(tokenKey, actualTokensUsed);
        redisTemplate.expire(tokenKey, Duration.ofDays(2));
    }

    /** Current date as yyyyMMdd; scopes the daily token counter. */
    private String getCurrentDate() {
        return LocalDate.now().format(DateTimeFormatter.ofPattern("yyyyMMdd"));
    }

    /** Outcome of a rate-limit check. */
    public record RateLimitResult(boolean allowed, String message) {
        public static RateLimitResult allowed() {
            return new RateLimitResult(true, null);
        }
        public static RateLimitResult rateLimited(String message) {
            return new RateLimitResult(false, message);
        }
    }
}
用法三:分布式对话记忆
前面LangChain4j的文章里提到了Redis持久化ChatMemory。这里再说一个用法:跨服务的会话共享。
/**
 * Distributed conversation context.
 * Lets multiple service instances share one user's conversation state.
 */
@Service
@RequiredArgsConstructor
public class DistributedConversationContext {

    private final RedisTemplate<String, Object> redisTemplate;

    private static final String CTX_KEY_PREFIX = "conversation:ctx:";
    private static final Duration CTX_TTL = Duration.ofHours(2);

    /**
     * Saves the key context extracted from the current conversation.
     * Leaner than the full chat history, suitable for cross-service sharing.
     */
    public void updateContext(String sessionId, ConversationContext ctx) {
        String key = CTX_KEY_PREFIX + sessionId;
        Map<String, String> fields = ctx.toMap();
        // Redis HMSET rejects an empty field map (Spring Data Redis throws
        // IllegalArgumentException), so skip the write when there is no data.
        if (fields.isEmpty()) {
            return;
        }
        redisTemplate.opsForHash().putAll(key, fields);
        redisTemplate.expire(key, CTX_TTL);
    }

    /**
     * Loads a session's context; all fields are null when nothing is stored.
     */
    public ConversationContext getContext(String sessionId) {
        String key = CTX_KEY_PREFIX + sessionId;
        Map<Object, Object> map = redisTemplate.opsForHash().entries(key);
        return ConversationContext.fromMap(map);
    }

    /** Compact conversation state shared between service instances. */
    @Data
    @Builder
    public static class ConversationContext {
        private String userId;
        private String lastOrderId;          // most recently discussed order
        private String lastProductId;        // most recently discussed product
        private String userIntent;           // current intent
        private String pendingConfirmation;  // action awaiting confirmation

        /** Serializes the non-null fields into a Redis-hash-friendly map. */
        public Map<String, String> toMap() {
            Map<String, String> map = new HashMap<>();
            if (userId != null) map.put("userId", userId);
            if (lastOrderId != null) map.put("lastOrderId", lastOrderId);
            if (lastProductId != null) map.put("lastProductId", lastProductId);
            if (userIntent != null) map.put("userIntent", userIntent);
            if (pendingConfirmation != null) map.put("pendingConfirmation", pendingConfirmation);
            return map;
        }

        /** Rebuilds a context from a Redis hash; missing keys become null. */
        public static ConversationContext fromMap(Map<Object, Object> map) {
            return ConversationContext.builder()
                    .userId((String) map.get("userId"))
                    .lastOrderId((String) map.get("lastOrderId"))
                    .lastProductId((String) map.get("lastProductId"))
                    .userIntent((String) map.get("userIntent"))
                    .pendingConfirmation((String) map.get("pendingConfirmation"))
                    .build();
        }
    }
}
用法四:LLM请求队列
高峰期LLM请求超出并发限制时,用Redis队列做缓冲:
/**
 * LLM request queue.
 * Buffers requests in Redis during traffic peaks and bounds the number of
 * concurrent LLM calls so the provider's limits are not exceeded.
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class LlmRequestQueue {

    private final StringRedisTemplate redisTemplate;
    private final ChatLanguageModel llmModel;

    private static final String QUEUE_KEY = "llm:request:queue";
    private static final String RESULT_KEY_PREFIX = "llm:result:";
    // Maximum number of concurrent LLM calls.
    private static final int MAX_CONCURRENT = 5;

    private final Semaphore concurrencySemaphore = new Semaphore(MAX_CONCURRENT);

    /**
     * Enqueues a request and returns its id for later polling.
     */
    public String submitRequest(String userId, String prompt) {
        String requestId = UUID.randomUUID().toString();
        LlmRequest request = new LlmRequest(requestId, userId, prompt,
                System.currentTimeMillis());
        redisTemplate.opsForList().rightPush(QUEUE_KEY, toJson(request));
        log.info("LLM请求入队: requestId={}, 队列长度: {}",
                requestId, redisTemplate.opsForList().size(QUEUE_KEY));
        return requestId;
    }

    /**
     * Polls for the result of a previously submitted request.
     */
    public Optional<String> getResult(String requestId) {
        String resultKey = RESULT_KEY_PREFIX + requestId;
        return Optional.ofNullable(redisTemplate.opsForValue().get(resultKey));
    }

    /**
     * Queue consumer (runs in the background).
     */
    @Scheduled(fixedDelay = 100) // poll every 100 ms
    public void processQueue() {
        if (concurrencySemaphore.availablePermits() == 0) {
            return; // all permits in use, try again on the next tick
        }
        String requestJson = redisTemplate.opsForList().leftPop(QUEUE_KEY);
        if (requestJson == null) {
            return; // queue empty
        }
        LlmRequest request = fromJson(requestJson, LlmRequest.class);
        // Process asynchronously.
        CompletableFuture.runAsync(() -> {
            // Track whether the permit was actually obtained: the original
            // released in finally even when acquire() threw before acquiring,
            // which would mint an extra permit and break the concurrency cap.
            boolean acquired = false;
            try {
                concurrencySemaphore.acquire();
                acquired = true;
                String answer = llmModel.generate(request.getPrompt());
                // Store the result in Redis with a 10-minute TTL.
                String resultKey = RESULT_KEY_PREFIX + request.getRequestId();
                redisTemplate.opsForValue().set(resultKey, answer, Duration.ofMinutes(10));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } catch (Exception e) {
                // Previously this propagated into the CompletableFuture and
                // vanished, silently losing the request; log it instead.
                // NOTE(review): no result is written for a failed request, so
                // getResult() pollers must use a timeout — confirm callers do.
                log.error("LLM请求处理失败: requestId={}", request.getRequestId(), e);
            } finally {
                if (acquired) {
                    concurrencySemaphore.release();
                }
            }
        });
    }

    /** Queued request payload. */
    @Data
    @AllArgsConstructor
    private static class LlmRequest {
        private String requestId;
        private String userId;
        private String prompt;
        private long submitTime;
    }
}
用法五:LLM调用结果的精确缓存
对于确定性的LLM调用(temperature=0,每次结果相同),可以做精确缓存:
/**
 * Cache for deterministic LLM calls.
 * Intended for temperature=0 workloads such as data extraction and
 * classification, where identical inputs always produce identical outputs.
 */
@Aspect
@Component
@RequiredArgsConstructor
@Slf4j
public class DeterministicLlmCacheAspect {

    private final StringRedisTemplate redisTemplate;

    /**
     * Around-advice: return the cached result when present, otherwise run the
     * annotated method and cache its serialized return value.
     */
    @Around("@annotation(cacheLlm)")
    public Object cacheAroundLlmCall(ProceedingJoinPoint pjp,
                                     CacheLlmResult cacheLlm) throws Throwable {
        // Cache key = method name + hash of the argument list.
        final String key = buildCacheKey(pjp);

        // Cache probe.
        final String hit = redisTemplate.opsForValue().get(key);
        if (hit != null) {
            log.debug("LLM结果缓存命中: {}", key);
            return deserialize(hit, pjp.getSignature());
        }

        // Miss: invoke the real method, then store its serialized result with
        // the TTL taken from the annotation.
        final Object outcome = pjp.proceed();
        redisTemplate.opsForValue().set(
                key,
                serialize(outcome),
                Duration.of(cacheLlm.ttlHours(), ChronoUnit.HOURS));
        return outcome;
    }

    /** Derives a stable key from the method signature and argument contents. */
    private String buildCacheKey(ProceedingJoinPoint pjp) {
        byte[] argBytes = Arrays.deepToString(pjp.getArgs())
                .getBytes(StandardCharsets.UTF_8);
        return "llm-cache:" + pjp.getSignature().toShortString()
                + ":" + DigestUtils.md5DigestAsHex(argBytes);
    }
}
/**
 * Marks a method whose return value may be cached by
 * {@link DeterministicLlmCacheAspect}; intended for deterministic
 * (temperature=0) LLM calls where the same input yields the same output.
 */
@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface CacheLlmResult {
// Cache time-to-live in hours; defaults to one day.
int ttlHours() default 24;
}
// Usage example
@Service
@RequiredArgsConstructor
public class TextAnalysisService {

    // Bug fix: the original declared this final field with no constructor and
    // no Lombok annotation, so it could never be initialized (compile error);
    // @RequiredArgsConstructor generates the injection constructor, matching
    // the style of the other services in this article.
    private final ChatLanguageModel llm;

    /**
     * Classifies a text via a deterministic LLM call; results are cached for
     * 48 hours by DeterministicLlmCacheAspect.
     *
     * NOTE(review): TextCategory.valueOf throws IllegalArgumentException when
     * the model returns a label outside the enum — confirm the prompt
     * constrains the output, or add a fallback category.
     */
    @CacheLlmResult(ttlHours = 48)
    public TextCategory classify(String text) {
        String result = llm.generate("对以下文本进行分类:" + text);
        // Locale.ROOT keeps the enum-name casing locale-independent
        // (e.g. avoids the Turkish dotless-i problem).
        return TextCategory.valueOf(result.trim().toUpperCase(Locale.ROOT));
    }
}
这五种用法,从成本优化到性能提升,覆盖了LLM应用中Redis的主要使用场景。优先级排序:语义缓存 > 速率限制 > 对话记忆 > 精确缓存 > 请求队列。根据你的场景选择性实现。
